From cb666596283a96db2b66e72c79f773add871020f Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Tue, 21 Apr 2026 18:34:00 -0400 Subject: [PATCH 01/24] Initial accuracy analysis Signed-off-by: Jing Chen --- accuracy/k8s/configmap-scripts.yaml | 17 + accuracy/k8s/configmap-sweep.yaml | 15 + accuracy/k8s/hf-secret.yaml | 19 + accuracy/k8s/namespace.yaml | 4 + accuracy/k8s/orchestrator-job.yaml | 46 ++ accuracy/k8s/pvc.yaml | 14 + accuracy/k8s/rbac.yaml | 35 ++ accuracy/k8s/vllm-job-reference.yaml | 71 ++++ accuracy/results/v0.19.0/deep_analysis.md | 333 +++++++++++++++ .../results/v0.19.0/parameter_sensitivity.md | 217 ++++++++++ accuracy/results/v0.19.0/report.md | 142 +++++++ accuracy/results/v0.19.0/results.csv | 65 +++ ...pseek-v2---h100-80gb--tp1pp1dp1--8192.json | 25 ++ ...pseek-v2---h100-80gb--tp2pp1dp1--8192.json | 25 ++ ...pseek-v2---h100-80gb--tp4pp1dp1--8192.json | 25 ++ ...a-3-3-70b--h100-80gb--tp2pp1dp1--8192.json | 26 ++ ...a-3-3-70b--h100-80gb--tp4pp1dp1--8192.json | 26 ++ ...nite-3-1---h100-80gb--tp1pp1dp1--8192.json | 25 ++ ...nite-3-1---h100-80gb--tp2pp1dp1--8192.json | 25 ++ ...nite-3-1---h100-80gb--tp4pp1dp1--8192.json | 25 ++ ...nite-3-3---h100-80gb--tp1pp1dp1--8192.json | 25 ++ ...nite-3-3---h100-80gb--tp2pp1dp1--8192.json | 25 ++ ...nite-3-3---h100-80gb--tp4pp1dp1--8192.json | 25 ++ ...nite-visi--h100-80gb--tp1pp1dp1--8192.json | 25 ++ ...nite-visi--h100-80gb--tp2pp1dp1--8192.json | 25 ++ ...00-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.json | 26 ++ ...100-80gb--tp1pp1dp1--8192-dtf16-kvfp8.json | 4 + ...a---h100-80gb--tp1pp1dp1--8192-dtbf16.json | 26 ++ ...a-3--h100-80gb--tp1pp1dp1--8192-dtf16.json | 26 ++ ...a-3--h100-80gb--tp1pp1dp1--8192-dtf32.json | 26 ++ ...a-3-1-8b---h100-80gb--tp1pp1dp1--2048.json | 26 ++ ...a-3-1-8b---h100-80gb--tp1pp1dp1--4096.json | 26 ++ ...a-3-1-8b---h100-80gb--tp1pp1dp1--8192.json | 25 ++ ...a-3-1-8b---h100-80gb--tp1pp2dp1--8192.json | 26 ++ ...a-3-1-8b---h100-80gb--tp1pp4dp1--8192.json | 26 ++ 
...a-3-1-8b---h100-80gb--tp2pp1dp1--8192.json | 25 ++ ...a-3-1-8b---h100-80gb--tp3pp1dp1--8192.json | 4 + ...a-3-1-8b---h100-80gb--tp4pp1dp1--8192.json | 25 ++ ...a-3-1-8b--h100-80gb--tp1pp1dp1--16384.json | 1 + ...a-3-1-8b--h100-80gb--tp1pp1dp1--32768.json | 26 ++ ...a-4-scout--h100-80gb--tp1pp1dp1--8192.json | 1 + ...a-4-scout--h100-80gb--tp2pp1dp1--8192.json | 1 + ...a-4-scout--h100-80gb--tp4pp1dp1--8192.json | 25 ++ ...oft-phi-4--h100-80gb--tp1pp1dp1--8192.json | 25 ++ ...oft-phi-4--h100-80gb--tp2pp1dp1--8192.json | 25 ++ ...oft-phi-4--h100-80gb--tp4pp1dp1--8192.json | 1 + ...al-small---h100-80gb--tp1pp1dp1--8192.json | 25 ++ ...al-small---h100-80gb--tp2pp1dp1--8192.json | 25 ++ ...al-small---h100-80gb--tp4pp1dp1--8192.json | 25 ++ ...al-8x7b-i--h100-80gb--tp1pp1dp1--8192.json | 1 + ...al-8x7b-i--h100-80gb--tp2pp1dp1--8192.json | 25 ++ ...al-8x7b-i--h100-80gb--tp4pp1dp1--8192.json | 25 ++ ...i-dev-72b--h100-80gb--tp2pp1dp1--8192.json | 25 ++ ...i-dev-72b--h100-80gb--tp4pp1dp1--8192.json | 25 ++ ...-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.json | 25 ++ ...-vl-a3b-i--h100-80gb--tp2pp1dp1--8192.json | 25 ++ ...-oss-120b--h100-80gb--tp4pp1dp1--8192.json | 1 + ...-oss-120b--h100-80gb--tp8pp1dp1--8192.json | 1 + ...t-oss-20b--h100-80gb--tp4pp1dp1--8192.json | 25 ++ ...n-7b-chat--h100-80gb--tp1pp1dp1--8192.json | 1 + ...n-7b-chat--h100-80gb--tp2pp1dp1--8192.json | 1 + ...n-7b-chat--h100-80gb--tp4pp1dp1--8192.json | 1 + ...00-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.json | 26 ++ ...100-80gb--tp1pp1dp1--8192-dtf16-kvfp8.json | 4 + ...b-instruc--h100-80gb--tp2pp1dp1--8192.json | 25 ++ ...b-instruc--h100-80gb--tp4pp1dp1--8192.json | 25 ++ ...b-instruc--h100-80gb--tp8pp1dp1--8192.json | 1 + ...-i--h100-80gb--tp1pp1dp1--8192-dtbf16.json | 26 ++ ...-in--h100-80gb--tp1pp1dp1--8192-dtf16.json | 26 ++ ...-instruc--h100-80gb--tp1pp1dp1--16384.json | 26 ++ ...-instruc--h100-80gb--tp1pp1dp1--32768.json | 26 ++ ...-instruct--h100-80gb--tp1pp1dp1--2048.json | 26 ++ 
...-instruct--h100-80gb--tp1pp1dp1--4096.json | 26 ++ ...-instruct--h100-80gb--tp1pp1dp1--8192.json | 25 ++ ...-instruct--h100-80gb--tp2pp1dp1--8192.json | 25 ++ ...-instruct--h100-80gb--tp4pp1dp1--8192.json | 25 ++ ...3-30b-a3b--h100-80gb--tp1pp1dp1--8192.json | 25 ++ ...3-30b-a3b--h100-80gb--tp2pp1dp1--8192.json | 1 + ...3-30b-a3b--h100-80gb--tp4pp1dp1--8192.json | 25 ++ ...-qwen3-8b--h100-80gb--tp1pp1dp1--8192.json | 25 ++ ...-qwen3-8b--h100-80gb--tp2pp1dp1--8192.json | 1 + ...-qwen3-8b--h100-80gb--tp4pp1dp1--8192.json | 25 ++ ...h100-80gb--tp1pp1dp1--8192-dtf16-qawq.json | 4 + ...h100-80gb--tp1pp1dp1--8192-dtf16-qfp8.json | 4 + ...3-1--h100-80gb--tp1pp1dp1--8192-dtf16.json | 26 ++ ...3-3-70b-i--h100-80gb--tp1pp1dp1--8192.json | 25 ++ ...3-3-70b-i--h100-80gb--tp2pp1dp1--8192.json | 25 ++ ...3-3-70b-i--h100-80gb--tp4pp1dp1--8192.json | 25 ++ ...lam--h100-80gb--tp1pp1dp1--8192-dtf16.json | 26 ++ ...lla--h100-80gb--tp1pp1dp1--8192-dtf16.json | 26 ++ ...small-24b--h100-80gb--tp1pp1dp1--8192.json | 26 ++ ...small-24b--h100-80gb--tp2pp1dp1--8192.json | 26 ++ ...lam--h100-80gb--tp1pp1dp1--8192-dtf16.json | 26 ++ ...wen2-5-7b--h100-80gb--tp1pp1dp1--8192.json | 26 ++ accuracy/scripts/analyze.py | 276 ++++++++++++ accuracy/scripts/collect.py | 153 +++++++ accuracy/scripts/deep_analysis.py | 397 ++++++++++++++++++ accuracy/scripts/parse_log.py | 100 +++++ accuracy/scripts/sweep.yaml | 123 ++++++ accuracy/scripts/sweep_runner.py | 346 +++++++++++++++ .../tests/fixtures/llama_tp1_expected.json | 11 + accuracy/tests/test_analyze.py | 83 ++++ accuracy/tests/test_collect.py | 79 ++++ accuracy/tests/test_parse_log.py | 74 ++++ accuracy/tests/test_sweep_runner.py | 141 +++++++ 105 files changed, 4419 insertions(+) create mode 100644 accuracy/k8s/configmap-scripts.yaml create mode 100644 accuracy/k8s/configmap-sweep.yaml create mode 100644 accuracy/k8s/hf-secret.yaml create mode 100644 accuracy/k8s/namespace.yaml create mode 100644 accuracy/k8s/orchestrator-job.yaml create mode 
100644 accuracy/k8s/pvc.yaml create mode 100644 accuracy/k8s/rbac.yaml create mode 100644 accuracy/k8s/vllm-job-reference.yaml create mode 100644 accuracy/results/v0.19.0/deep_analysis.md create mode 100644 accuracy/results/v0.19.0/parameter_sensitivity.md create mode 100644 accuracy/results/v0.19.0/report.md create mode 100644 accuracy/results/v0.19.0/results.csv create mode 100644 accuracy/results/v0.19.0/runs/deepseek-ai-deepseek-v2---h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/deepseek-ai-deepseek-v2---h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/deepseek-ai-deepseek-v2---h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/fp8dyn-llama-3-3-70b--h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/fp8dyn-llama-3-3-70b--h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/ibm-granite-granite-3-1---h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/ibm-granite-granite-3-1---h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/ibm-granite-granite-3-1---h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/ibm-granite-granite-3-3---h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/ibm-granite-granite-3-3---h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/ibm-granite-granite-3-3---h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/ibm-granite-granite-visi--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/ibm-granite-granite-visi--h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-l--h100-80gb--tp1pp1dp1--8192-dtf16-kvfp8.json create mode 100644 
accuracy/results/v0.19.0/runs/meta-llama-llama---h100-80gb--tp1pp1dp1--8192-dtbf16.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3--h100-80gb--tp1pp1dp1--8192-dtf16.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3--h100-80gb--tp1pp1dp1--8192-dtf32.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--2048.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--4096.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp2dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp4dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp3pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--16384.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--32768.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/microsoft-phi-4--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/microsoft-phi-4--h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/microsoft-phi-4--h100-80gb--tp4pp1dp1--8192.json create mode 
100644 accuracy/results/v0.19.0/runs/mistralai-mistral-small---h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/mistralai-mistral-small---h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/mistralai-mistral-small---h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/mistralai-mixtral-8x7b-i--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/mistralai-mixtral-8x7b-i--h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/mistralai-mixtral-8x7b-i--h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/moonshotai-kimi-dev-72b--h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/openai-gpt-oss-120b--h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/openai-gpt-oss-120b--h100-80gb--tp8pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/openai-gpt-oss-20b--h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen-7b-chat--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen-7b-chat--h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen-7b-chat--h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5--h100-80gb--tp1pp1dp1--8192-dtf16-kvfp8.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-72b-instruc--h100-80gb--tp2pp1dp1--8192.json create mode 
100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-72b-instruc--h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-72b-instruc--h100-80gb--tp8pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-in--h100-80gb--tp1pp1dp1--8192-dtf16.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruc--h100-80gb--tp1pp1dp1--16384.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruc--h100-80gb--tp1pp1dp1--32768.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--2048.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--4096.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen3-30b-a3b--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen3-30b-a3b--h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen3-30b-a3b--h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen3-8b--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen3-8b--h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen3-8b--h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16-qawq.json create mode 100644 accuracy/results/v0.19.0/runs/redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16-qfp8.json create mode 100644 
accuracy/results/v0.19.0/runs/redhatai-llama-3-1--h100-80gb--tp1pp1dp1--8192-dtf16.json create mode 100644 accuracy/results/v0.19.0/runs/redhatai-llama-3-3-70b-i--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/redhatai-llama-3-3-70b-i--h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/redhatai-llama-3-3-70b-i--h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/redhatai-meta-llam--h100-80gb--tp1pp1dp1--8192-dtf16.json create mode 100644 accuracy/results/v0.19.0/runs/w4a16-redhatai-lla--h100-80gb--tp1pp1dp1--8192-dtf16.json create mode 100644 accuracy/results/v0.19.0/runs/w8a8-mistral-small-24b--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/w8a8-mistral-small-24b--h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/w8a8-redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16.json create mode 100644 accuracy/results/v0.19.0/runs/w8a8-redhatai-qwen2-5-7b--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/scripts/analyze.py create mode 100644 accuracy/scripts/collect.py create mode 100644 accuracy/scripts/deep_analysis.py create mode 100644 accuracy/scripts/parse_log.py create mode 100644 accuracy/scripts/sweep.yaml create mode 100644 accuracy/scripts/sweep_runner.py create mode 100644 accuracy/tests/fixtures/llama_tp1_expected.json create mode 100644 accuracy/tests/test_analyze.py create mode 100644 accuracy/tests/test_collect.py create mode 100644 accuracy/tests/test_parse_log.py create mode 100644 accuracy/tests/test_sweep_runner.py diff --git a/accuracy/k8s/configmap-scripts.yaml b/accuracy/k8s/configmap-scripts.yaml new file mode 100644 index 00000000..51ed4c68 --- /dev/null +++ b/accuracy/k8s/configmap-scripts.yaml @@ -0,0 +1,17 @@ +# accuracy/k8s/configmap-scripts.yaml +# Generated — do not edit directly. 
+# Regenerate with: +# kubectl create configmap vllm-mem-scripts \ +# --from-file=sweep_runner.py=accuracy/scripts/sweep_runner.py \ +# --from-file=parse_log.py=accuracy/scripts/parse_log.py \ +# --namespace llmdplanner --dry-run=client -o yaml > accuracy/k8s/configmap-scripts.yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: vllm-mem-scripts + namespace: llmdplanner +data: + sweep_runner.py: | + # Contents generated by the kubectl command above. + parse_log.py: | + # Contents generated by the kubectl command above. diff --git a/accuracy/k8s/configmap-sweep.yaml b/accuracy/k8s/configmap-sweep.yaml new file mode 100644 index 00000000..5344b198 --- /dev/null +++ b/accuracy/k8s/configmap-sweep.yaml @@ -0,0 +1,15 @@ +# accuracy/k8s/configmap-sweep.yaml +# Generated from accuracy/scripts/sweep.yaml. +# Sync with: kubectl create configmap vllm-mem-sweep \ +# --from-file=sweep.yaml=accuracy/scripts/sweep.yaml \ +# --namespace llmdplanner --dry-run=client -o yaml \ +# | kubectl apply -f - +apiVersion: v1 +kind: ConfigMap +metadata: + name: vllm-mem-sweep + namespace: llmdplanner +data: + sweep.yaml: | + # Contents of accuracy/scripts/sweep.yaml go here. + # Use the kubectl command above to generate and keep in sync. diff --git a/accuracy/k8s/hf-secret.yaml b/accuracy/k8s/hf-secret.yaml new file mode 100644 index 00000000..a3034607 --- /dev/null +++ b/accuracy/k8s/hf-secret.yaml @@ -0,0 +1,19 @@ +# This file is a template — the token value is never committed to the repo. 
+# +# Create the Secret before running the campaign: +# +# kubectl create secret generic hf-token \ +# --from-literal=token=hf_YOUR_TOKEN_HERE \ +# --namespace llmdplanner +# +# Or apply this file after substituting the token: +# HF_TOKEN=hf_xxx envsubst < accuracy/k8s/hf-secret.yaml | kubectl apply -f - +# +apiVersion: v1 +kind: Secret +metadata: + name: hf-token + namespace: llmdplanner +type: Opaque +stringData: + token: "${HF_TOKEN}" # NEVER commit a real token here diff --git a/accuracy/k8s/namespace.yaml b/accuracy/k8s/namespace.yaml new file mode 100644 index 00000000..9bd1d5ca --- /dev/null +++ b/accuracy/k8s/namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: llmdplanner diff --git a/accuracy/k8s/orchestrator-job.yaml b/accuracy/k8s/orchestrator-job.yaml new file mode 100644 index 00000000..ab90f198 --- /dev/null +++ b/accuracy/k8s/orchestrator-job.yaml @@ -0,0 +1,46 @@ +# Submit with: kubectl apply -f accuracy/k8s/orchestrator-job.yaml +# Monitor with: kubectl logs -f job/vllm-mem-orchestrator -n llmdplanner +apiVersion: batch/v1 +kind: Job +metadata: + name: vllm-mem-orchestrator + namespace: llmdplanner +spec: + backoffLimit: 0 + activeDeadlineSeconds: 86400 # 24-hour hard cap for the full sweep + template: + spec: + serviceAccountName: vllm-mem-orchestrator + restartPolicy: Never + volumes: + - name: data + persistentVolumeClaim: + claimName: vllm-mem-data + - name: scripts + configMap: + name: vllm-mem-scripts + defaultMode: 0755 + - name: sweep + configMap: + name: vllm-mem-sweep + containers: + - name: orchestrator + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + pip install pyyaml kubernetes --quiet --no-cache-dir && + python /scripts/sweep_runner.py \ + --config /sweep/sweep.yaml \ + --results /data/results/ + volumeMounts: + - name: data + mountPath: /data + - name: scripts + mountPath: /scripts + - name: sweep + mountPath: /sweep + resources: + requests: + cpu: "500m" + memory: "512Mi" diff 
--git a/accuracy/k8s/pvc.yaml b/accuracy/k8s/pvc.yaml new file mode 100644 index 00000000..eb26ecdf --- /dev/null +++ b/accuracy/k8s/pvc.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: vllm-mem-data + namespace: llmdplanner +spec: + accessModes: + - ReadWriteMany # ReadWriteOnce also works if sub-jobs run sequentially on the same node + resources: + requests: + storage: 500Gi + # storageClassName is intentionally omitted to use the cluster's default. + # To use a specific class: uncomment and set below. + # storageClassName: standard diff --git a/accuracy/k8s/rbac.yaml b/accuracy/k8s/rbac.yaml new file mode 100644 index 00000000..cab770e9 --- /dev/null +++ b/accuracy/k8s/rbac.yaml @@ -0,0 +1,35 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: vllm-mem-orchestrator + namespace: llmdplanner +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: vllm-mem-orchestrator + namespace: llmdplanner +rules: + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["create", "get", "list", "watch", "delete"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["pods/log"] + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: vllm-mem-orchestrator + namespace: llmdplanner +subjects: + - kind: ServiceAccount + name: vllm-mem-orchestrator + namespace: llmdplanner +roleRef: + kind: Role + name: vllm-mem-orchestrator + apiGroup: rbac.authorization.k8s.io diff --git a/accuracy/k8s/vllm-job-reference.yaml b/accuracy/k8s/vllm-job-reference.yaml new file mode 100644 index 00000000..accd13c7 --- /dev/null +++ b/accuracy/k8s/vllm-job-reference.yaml @@ -0,0 +1,71 @@ +# REFERENCE ONLY — not applied directly. The orchestrator builds equivalent Job +# objects at runtime using the kubernetes Python client. 
+apiVersion: batch/v1 +kind: Job +metadata: + name: "vllm-mem-" # e.g., vllm-mem-llama3-1-8b--h100-80gb--tp1pp1dp1--8192 + namespace: llmdplanner + labels: + app: vllm-mem-validation + run-id: "" +spec: + backoffLimit: 0 # no retries — failures need investigation + activeDeadlineSeconds: 3600 # 1-hour hard timeout per run + template: + metadata: + labels: + app: vllm-mem-validation + run-id: "" + spec: + restartPolicy: Never + volumes: + - name: data + persistentVolumeClaim: + claimName: vllm-mem-data + containers: + - name: vllm + image: vllm/vllm-openai:v0.19.0 + command: ["vllm", "serve", ""] + args: + - "--tensor-parallel-size=" + - "--pipeline-parallel-size=" + - "--data-parallel-size=" + - "--gpu-memory-utilization=0.95" + - "--max-model-len=" + - "--no-enable-prefix-caching" + - "--disable-log-requests" + env: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: token + - name: HF_HOME + value: /data/models # reuse cached weights across runs + resources: + limits: + nvidia.com/gpu: # = TP × PP + requests: + nvidia.com/gpu: + volumeMounts: + - name: data + mountPath: /data + # startupProbe: polls /health until vLLM is ready to serve. + # The orchestrator watches for this to pass (container.ready = True) + # then fetches the full pod log and deletes the Job. + # failureThreshold × periodSeconds = 30 min max startup window. + startupProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 # vLLM needs at least ~60s to begin loading + periodSeconds: 10 + failureThreshold: 180 # 180 × 10s = 30 minutes + successThreshold: 1 + nodeSelector: + # CHANGE THIS to match your cluster's GPU node label. 
+ nvidia.com/gpu.product: NVIDIA-H100-80GB-HBM3 + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/accuracy/results/v0.19.0/deep_analysis.md b/accuracy/results/v0.19.0/deep_analysis.md new file mode 100644 index 00000000..f6e07682 --- /dev/null +++ b/accuracy/results/v0.19.0/deep_analysis.md @@ -0,0 +1,333 @@ +# Capacity Planner — Deep Accuracy Analysis + +_vLLM v0.19.0 · H100-80GB · 64 runs · 23 models_ + +## Executive Summary + +**Runs analyzed**: 64 across 23 models on 1 GPU type(s). + +### Overall accuracy + +| Cohort | N | Mean err | MAE | Min / Max | ≤5% | ≤10% | +|---|---|---|---|---|---|---| +| Weight memory | 64 | -1.8% | +1.8% | -50.1% / 0.0% | 94% | 95% | +| KV cache memory | 64 | +1.9% | +7.6% | -32.7% / +61.9% | 66% | 89% | + +## By architecture type + +| Cohort | N | Mean err | MAE | Min / Max | ≤5% | ≤10% | +|---|---|---|---|---|---|---| +| **Dense** — weight | 49 | -1.8% | +1.8% | -50.1% / 0.0% | 94% | 96% | +| **Dense** — KV | 49 | +0.9% | +6.2% | -32.7% / +60.9% | 71% | 94% | +| **MoE** — weight | 11 | -2.0% | +2.0% | -11.8% / -0.0% | 91% | 91% | +| **MoE** — KV | 11 | +7.5% | +15.2% | -28.7% / +61.9% | 36% | 64% | +| **Multimodal** — weight | 4 | -0.7% | +0.7% | -1.6% / 0.0% | 100% | 100% | +| **Multimodal** — KV | 4 | -1.5% | +4.0% | -9.8% / +2.6% | 75% | 100% | + +## Per-model-family accuracy + +| Cohort | N | Mean err | MAE | Min / Max | ≤5% | ≤10% | +|---|---|---|---|---|---|---| +| **DeepSeek** — weight | 3 | -1.5% | +1.5% | -2.7% / -0.6% | 100% | 100% | +| **DeepSeek** — KV | 3 | -2.4% | +5.3% | -11.5% / +3.7% | 67% | 67% | +| **GPT-OSS (openai)** — weight | 1 | -11.8% | +11.8% | -11.8% / -11.8% | 0% | 0% | +| **GPT-OSS (openai)** — KV | 1 | +5.5% | +5.5% | +5.5% / +5.5% | 0% | 100% | +| **Granite** — weight | 6 | -0.9% | +0.9% | -1.8% / -0.2% | 100% | 100% | +| **Granite** — KV | 6 | -0.8% | +2.9% | -6.0% / +2.7% | 67% | 100% | +| **Granite-Vision** — weight | 2 | -0.4% | +0.4% | -0.7% / 0.0% | 100% 
| 100% | +| **Granite-Vision** — KV | 2 | +0.7% | +1.9% | -1.2% / +2.6% | 100% | 100% | +| **Kimi** — weight | 2 | -0.3% | +0.3% | -0.4% / -0.2% | 100% | 100% | +| **Kimi** — KV | 2 | +35.6% | +35.6% | +9.3% / +61.9% | 0% | 50% | +| **Kimi-VL** — weight | 2 | -1.1% | +1.1% | -1.6% / -0.6% | 100% | 100% | +| **Kimi-VL** — KV | 2 | -3.7% | +6.1% | -9.8% / +2.4% | 50% | 100% | +| **Llama-3.1** — weight | 16 | -4.2% | +4.2% | -50.1% / -0.2% | 88% | 88% | +| **Llama-3.1** — KV | 16 | +0.2% | +4.8% | -3.5% / +31.1% | 94% | 94% | +| **Llama-3.3** — weight | 5 | -0.2% | +0.2% | -0.2% / -0.1% | 100% | 100% | +| **Llama-3.3** — KV | 5 | -2.2% | +10.9% | -32.7% / +5.9% | 0% | 80% | +| **Llama-4** — weight | 1 | -4.8% | +4.8% | -4.8% / -4.8% | 100% | 100% | +| **Llama-4** — KV | 1 | +36.2% | +36.2% | +36.2% / +36.2% | 0% | 0% | +| **Mistral-Small** — weight | 5 | -1.7% | +1.7% | -5.7% / -0.1% | 80% | 100% | +| **Mistral-Small** — KV | 5 | +4.5% | +4.5% | +1.2% / +7.5% | 40% | 100% | +| **Mixtral** — weight | 2 | -0.0% | +0.0% | -0.0% / -0.0% | 100% | 100% | +| **Mixtral** — KV | 2 | +0.3% | +2.2% | -1.9% / +2.4% | 100% | 100% | +| **Phi** — weight | 2 | -0.6% | +0.6% | -0.9% / -0.3% | 100% | 100% | +| **Phi** — KV | 2 | -2.3% | +4.3% | -6.6% / +2.0% | 50% | 100% | +| **Qwen2.5** — weight | 13 | -0.4% | +0.4% | -0.4% / 0.0% | 100% | 100% | +| **Qwen2.5** — KV | 13 | +3.1% | +8.8% | -4.2% / +60.9% | 85% | 92% | +| **Qwen3** — weight | 4 | -0.1% | +0.1% | -0.3% / -0.0% | 100% | 100% | +| **Qwen3** — KV | 4 | -5.8% | +10.8% | -28.7% / +5.4% | 50% | 75% | + +## TP sensitivity + +_KV cache error grouped by tensor-parallel degree (all models). 
Errors are shown after applying the per-GPU normalisation, ÷(TP×PP)._ + +| Cohort | N | Mean err | MAE | Min / Max | ≤5% | ≤10% | +|---|---|---|---|---|---|---| +| TP=1 | 34 | -4.3% | +6.4% | -32.7% / +31.1% | 76% | 88% | +| TP=2 | 15 | +10.5% | +10.7% | -1.9% / +61.9% | 60% | 87% | +| TP=4 | 15 | +7.3% | +7.3% | +2.4% / +36.2% | 47% | 93% | + +## PP sensitivity + +_KV cache error grouped by pipeline-parallel degree._ + +| Cohort | N | Mean err | MAE | Min / Max | ≤5% | ≤10% | +|---|---|---|---|---|---|---| +| PP=1 | 62 | +1.9% | +7.8% | -32.7% / +61.9% | 65% | 89% | +| PP=2 | 1 | -0.9% | +0.9% | -0.9% / -0.9% | 100% | 100% | +| PP=4 | 1 | +1.6% | +1.6% | +1.6% / +1.6% | 100% | 100% | + +## Context-length sensitivity (TP=1 runs only) + +_Models tested at multiple max_model_len values. KV cache error should be constant if the formula is context-length-agnostic. Repeated 8192 rows are dtype / kv_dtype variants of the same configuration (see the per-model breakdown); the +31.1% row is the float32 run._ + +**Qwen/Qwen2.5-7B-Instruct** + +| max_len | KV err | +|---|---| +| 2048 | -4.2% | +| 4096 | -4.2% | +| 8192 | -4.2% | +| 8192 | -4.2% | +| 8192 | -4.2% | +| 8192 | -4.2% | +| 16384 | -4.2% | +| 32768 | -4.2% | + +**meta-llama/Llama-3.1-8B-Instruct** + +| max_len | KV err | +|---|---| +| 2048 | -3.5% | +| 4096 | -3.5% | +| 8192 | -3.5% | +| 8192 | -3.5% | +| 8192 | -3.5% | +| 8192 | +31.1% | +| 8192 | -3.5% | +| 32768 | -3.5% | + +## Outliers (|error| > 10%) + +| Model | TP | PP | Weight err | KV err | Likely cause | +|---|---|---|---|---|---| +| moonshotai/Kimi-Dev-72B | 2 | 1 | -0.2% | +61.9% | TP/PP residual: per-GPU normalisation may be imprecise | +| Qwen/Qwen2.5-72B-Instruct | 2 | 1 | -0.1% | +60.9% | TP/PP residual: per-GPU normalisation may be imprecise | +| meta-llama/Llama-4-Scout-17B-16E-Instruct | 4 | 1 | -4.8% | +36.2% | TP/PP residual: per-GPU normalisation may be imprecise | +| redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 | 1 | 1 | -0.1% | -32.7% | large model: activation constant may underestimate real overhead | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | -50.1% | +31.1% | KV formula overestimates 
available budget | +| Qwen/Qwen3-30B-A3B | 1 | 1 | -0.0% | -28.7% | MoE: routing overhead not modeled in activation/KV budget | +| deepseek-ai/DeepSeek-V2-Lite-Chat | 1 | 1 | -0.6% | -11.5% | unknown | +| openai/gpt-oss-20b | 4 | 1 | -11.8% | +5.5% | MoE/sparse model: shared expert / embedding memory not sharded by TP | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 4 | -12.2% | +1.6% | PP≥4: weight sharding formula incorrect for high PP | + +## Calibration notes + +### Weight memory + +- Mean error -1.8% — slightly negative (planner underestimates). Cause: safetensors metadata reports storage dtype; actual in-memory size can differ due to alignment/padding. + +- PP≥4 and certain MoE models show >10% weight error — embedding and shared-expert tensors may not be sharded by TP/PP as assumed by the formula. + +### KV cache memory (TP=1, PP=1) + +- TP=1, PP=1 KV mean error -4.6% (MAE +6.7%) over 32 runs; the TP=1 row of the TP sensitivity table (-4.3% / +6.4%) additionally includes the PP=2 and PP=4 runs. Mostly within ±10%. + +- Consistent negative bias across TP=1 configs suggests activation_memory constant is slightly too high (over-reserves budget, leaving less for KV). + +### KV cache memory (TP>1) + +- After ÷(TP×PP) normalisation, errors are within ±10% for most models. + +- Remaining positive bias at TP=2/4 is consistent with extra NCCL/all-gather buffers not captured by non_torch constant. + +### Large-model KV outliers + +- `Qwen3-30B-A3B` (TP=1): −29%. MoE routing buffers consume more memory than modeled. + +- `Llama-3.3-70B-w8a8` (TP=1): −33%. W8A8 quantization increases activation-memory footprint (dequant workspace) not accounted for in constant. + +- `Kimi-Dev-72B` (TP=2): +62%. Likely residual normalisation issue or model-specific memory layout. + +- `Qwen2.5-72B` (TP=2): +61%. Same pattern as Kimi-Dev-72B — large model at TP=2 still shows excess after normalisation. 
+ +## Per-model breakdown + +### Qwen/Qwen2.5-72B-Instruct _Qwen2.5 · Dense_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 2 | 1 | 1 | 8192 | auto | — | auto | -0.1% | +60.9% | +| 4 | 1 | 1 | 8192 | auto | — | auto | -0.4% | +9.3% | + +### Qwen/Qwen2.5-7B-Instruct _Qwen2.5 · Dense_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 1 | 1 | 1 | 2048 | auto | — | auto | -0.4% | -4.2% | +| 1 | 1 | 1 | 4096 | auto | — | auto | -0.4% | -4.2% | +| 1 | 1 | 1 | 8192 | bfloat16 | — | fp8 | -0.4% | -4.2% | +| 1 | 1 | 1 | 8192 | bfloat16 | — | auto | -0.4% | -4.2% | +| 1 | 1 | 1 | 8192 | float16 | — | auto | -0.4% | -4.2% | +| 1 | 1 | 1 | 8192 | auto | — | auto | -0.4% | -4.2% | +| 1 | 1 | 1 | 16384 | auto | — | auto | -0.4% | -4.2% | +| 1 | 1 | 1 | 32768 | auto | — | auto | -0.4% | -4.2% | +| 2 | 1 | 1 | 8192 | auto | — | auto | -0.4% | +2.6% | +| 4 | 1 | 1 | 8192 | auto | — | auto | 0.0% | +4.6% | + +### Qwen/Qwen3-30B-A3B _Qwen3 · MoE_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 1 | 1 | 1 | 8192 | auto | — | auto | -0.0% | -28.7% | +| 4 | 1 | 1 | 8192 | auto | — | auto | -0.2% | +5.4% | + +### Qwen/Qwen3-8B _Qwen3 · Dense_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 1 | 1 | 1 | 8192 | auto | — | auto | -0.1% | -4.4% | +| 4 | 1 | 1 | 8192 | auto | — | auto | -0.3% | +4.7% | + +### RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic _Llama-3.3 · Dense_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 2 | 1 | 1 | 8192 | auto | — | auto | -0.1% | +5.0% | +| 4 | 1 | 1 | 8192 | auto | — | auto | -0.2% | +5.9% | + +### RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 _Llama-3.1 · Dense_ + +| TP | PP | DP | max_len | 
dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 1 | 1 | 1 | 8192 | float16 | — | auto | -0.7% | -3.0% | + +### RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 _Llama-3.1 · Dense_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 1 | 1 | 1 | 8192 | float16 | — | auto | -0.4% | -3.1% | +| 1 | 1 | 1 | 8192 | float16 | — | auto | -0.4% | -3.1% | +| 1 | 1 | 1 | 8192 | float16 | — | auto | -0.4% | -3.1% | + +### RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 _Mistral-Small · Dense_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 1 | 1 | 1 | 8192 | auto | — | auto | -0.2% | +1.2% | +| 2 | 1 | 1 | 8192 | auto | — | auto | -0.8% | +5.3% | + +### RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 _Qwen2.5 · Dense_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 1 | 1 | 1 | 8192 | auto | — | auto | -0.4% | -3.9% | + +### deepseek-ai/DeepSeek-V2-Lite-Chat _DeepSeek · MoE_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 1 | 1 | 1 | 8192 | auto | — | auto | -0.6% | -11.5% | +| 2 | 1 | 1 | 8192 | auto | — | auto | -1.3% | +0.6% | +| 4 | 1 | 1 | 8192 | auto | — | auto | -2.7% | +3.7% | + +### ibm-granite/granite-3.1-2b-instruct _Granite · Dense_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 1 | 1 | 1 | 8192 | auto | — | auto | -0.4% | -5.3% | +| 2 | 1 | 1 | 8192 | auto | — | auto | -0.8% | +0.4% | + +### ibm-granite/granite-3.1-8b-instruct _Granite · Dense_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 4 | 1 | 1 | 8192 | auto | — | auto | -1.8% | +2.7% | + +### ibm-granite/granite-3.3-8b-instruct 
_Granite · Dense_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 1 | 1 | 1 | 8192 | auto | — | auto | -0.2% | -6.0% | +| 2 | 1 | 1 | 8192 | auto | — | auto | -0.4% | +0.6% | +| 4 | 1 | 1 | 8192 | auto | — | auto | -1.8% | +2.7% | + +### ibm-granite/granite-vision-3.3-2b _Granite-Vision · Multimodal_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 1 | 1 | 1 | 8192 | auto | — | auto | 0.0% | -1.2% | +| 2 | 1 | 1 | 8192 | auto | — | auto | -0.7% | +2.6% | + +### meta-llama/Llama-3.1-8B-Instruct _Llama-3.1 · Dense_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 1 | 1 | 1 | 2048 | auto | — | auto | -0.2% | -3.5% | +| 1 | 1 | 1 | 4096 | auto | — | auto | -0.2% | -3.5% | +| 1 | 1 | 1 | 8192 | bfloat16 | — | fp8 | -0.2% | -3.5% | +| 1 | 1 | 1 | 8192 | bfloat16 | — | auto | -0.2% | -3.5% | +| 1 | 1 | 1 | 8192 | float16 | — | auto | -0.2% | -3.5% | +| 1 | 1 | 1 | 8192 | float32 | — | auto | -50.1% | +31.1% | +| 1 | 1 | 1 | 8192 | auto | — | auto | -0.2% | -3.5% | +| 1 | 1 | 1 | 32768 | auto | — | auto | -0.2% | -3.5% | +| 1 | 2 | 1 | 8192 | auto | — | auto | -0.4% | -0.9% | +| 1 | 4 | 1 | 8192 | auto | — | auto | -12.2% | +1.6% | +| 2 | 1 | 1 | 8192 | auto | — | auto | -0.4% | +2.8% | +| 4 | 1 | 1 | 8192 | auto | — | auto | -0.8% | +4.5% | + +### meta-llama/Llama-4-Scout-17B-16E-Instruct _Llama-4 · MoE_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 4 | 1 | 1 | 8192 | auto | — | auto | -4.8% | +36.2% | + +### microsoft/phi-4 _Phi · Dense_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 1 | 1 | 1 | 8192 | auto | — | auto | -0.3% | -6.6% | +| 2 | 1 | 1 | 8192 | auto | — | auto | -0.9% | +2.0% | + +### 
mistralai/Mistral-Small-3.1-24B-Instruct-2503 _Mistral-Small · Dense_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 1 | 1 | 1 | 8192 | auto | — | auto | -0.1% | +1.6% | +| 2 | 1 | 1 | 8192 | auto | — | auto | -1.9% | +7.2% | +| 4 | 1 | 1 | 8192 | auto | — | auto | -5.7% | +7.5% | + +### mistralai/Mixtral-8x7B-Instruct-v0.1 _Mixtral · MoE_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 2 | 1 | 1 | 8192 | auto | — | auto | -0.0% | -1.9% | +| 4 | 1 | 1 | 8192 | auto | — | auto | -0.0% | +2.4% | + +### moonshotai/Kimi-Dev-72B _Kimi · MoE_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 2 | 1 | 1 | 8192 | auto | — | auto | -0.2% | +61.9% | +| 4 | 1 | 1 | 8192 | auto | — | auto | -0.4% | +9.3% | + +### moonshotai/Kimi-VL-A3B-Instruct _Kimi-VL · Multimodal_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 1 | 1 | 1 | 8192 | auto | — | auto | -0.6% | -9.8% | +| 2 | 1 | 1 | 8192 | auto | — | auto | -1.6% | +2.4% | + +### openai/gpt-oss-20b _GPT-OSS (openai) · MoE_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 4 | 1 | 1 | 8192 | auto | — | auto | -11.8% | +5.5% | + +### redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 _Llama-3.3 · Dense_ + +| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | +|---|---|---|---|---|---|---|---|---| +| 1 | 1 | 1 | 8192 | auto | — | auto | -0.1% | -32.7% | +| 2 | 1 | 1 | 8192 | auto | — | auto | -0.1% | +5.0% | +| 4 | 1 | 1 | 8192 | auto | — | auto | -0.2% | +5.9% | diff --git a/accuracy/results/v0.19.0/parameter_sensitivity.md b/accuracy/results/v0.19.0/parameter_sensitivity.md new file mode 100644 index 00000000..1f33cd09 --- /dev/null +++ 
b/accuracy/results/v0.19.0/parameter_sensitivity.md @@ -0,0 +1,217 @@ +# Parameter Sensitivity Analysis — Capacity Planner vs vLLM + +_vLLM v0.19.0 · H100-80GB · Llama-3.1-8B and Qwen2.5-7B as reference models_ + +--- + +## Summary table + +| Parameter | Affects weight prediction? | Affects KV-GiB prediction? | Affects token-capacity prediction? | Planner handles it? | +|---|---|---|---|---| +| `--dtype` (bf16 / fp16 / auto) | No | No | No | ✅ N/A | +| `--dtype float32` | **Yes — 2× weight** | **Yes — 2× KV/token** | **Yes — 2×** | ❌ Gap (measured: −50% weight, +31% KV) | +| `--kv-cache-dtype` (auto / fp8) | No | No | **Yes — 2×** | ❌ Gap | +| Weight quantization (w8a8 / w4a16) — small model | ✅ Yes, correctly | ✅ Yes, correctly | No | ✅ Yes | +| Weight quantization (w8a8) — large model (70B) | ✅ Yes, correctly | ❌ Over-reserves | No | ⚠️ Partial | + +--- + +## `--dtype` (bf16 / fp16 / auto / **fp32**) + +**Tested: bf16, fp16, auto — no effect. fp32 — measured gap confirmed.** + +### Tested values (bf16 / fp16 / auto) + +Both bf16 and fp16 are 2-byte formats. For all models tested, switching `--dtype` among +`auto`, `bfloat16`, and `float16` produced **identical** weight and KV memory measurements. + +| Model | dtype | Weight (GiB) | KV avail (GiB) | KV tokens | +|---|---|---|---|---| +| Llama-3.1-8B | auto | 14.99 | 58.11 | 476 000 | +| Llama-3.1-8B | bfloat16 | 14.99 | 58.11 | 476 000 | +| Llama-3.1-8B | float16 | 14.99 | 58.11 | 476 016 | +| Qwen2.5-7B | auto | 14.25 | 58.53 | 1 096 000 | +| Qwen2.5-7B | bfloat16 | 14.25 | 58.53 | 1 096 000 | +| Qwen2.5-7B | float16 | 14.25 | 58.53 | 1 096 000 | + +The planner reads the storage dtype from safetensors metadata and handles these correctly. +For almost all production models the safetensors dtype is bf16, and `auto` / `bfloat16` / +`float16` all resolve to the same 2 bytes/element in memory. + +### `--dtype float32` — measured gap + +**Root cause**: `--dtype` is not a parameter the planner exposes. 
Neither +`per_gpu_model_memory_required()` nor `allocatable_kv_cache_memory()` accepts a dtype +argument, so there is no way to pass a runtime dtype override to the planner. Internally +it reads: +- Weight bytes from safetensors storage dtype (`model_params_by_dtype`) → bf16 = 2 bytes +- KV element size from `model_config.torch_dtype` or `inference_dtype()` → also bf16 + +With `--dtype float32`, vLLM upcasts every weight tensor to fp32 (4 bytes) in GPU memory +and also stores KV cache elements in fp32. The planner has no way to see or account for +this at all. + +**Measured prediction errors — Llama-3.1-8B, TP=1, H100-80GB:** + +| Component | Planner predicts (bf16 storage) | vLLM measured (fp32 runtime) | Error | +|---|---|---|---| +| Weight memory | ~14.96 GiB | **29.98 GiB** | **−50.1%** | +| KV avail (GiB) | ~56.1 GiB (inflated — sees extra ~13 GiB) | **42.80 GiB** | **+31.1%** | +| KV tokens | ~460 000 | **175 296** | ~+163% | +| KV block bytes | ~2.1 MB | ~4.2 MB | ~−50% (planner assumes 2 B/element; fp32 uses 4 B) | + +Because KV available GiB is computed as `GPU_budget − weight − activation − overhead`, +and weight is underestimated by 15 GiB, the planner predicts ~13 GiB more KV room +(56.1 vs 42.8 GiB) than actually exists. A model the planner declares fits on one H100 **may OOM at runtime**. + +**Fix required** in `per_gpu_model_memory_required()` / `KVCacheDetail.__init__()` +(in `capacity_planner.py`): accept a `dtype_override` argument. When `--dtype float32` +is requested, multiply all 2-byte storage costs by 2 before computing memory budgets. + +```python +# Pseudocode for the fix +if vllm_args.get("dtype") == "float32": + weight_bytes_per_param *= 2 # upcast from storage bf16 → fp32 + kv_bytes_per_element *= 2 # KV also stored in fp32 +``` + +**When does `--dtype float32` matter in practice?** +Rarely in production (bf16/fp16 is standard for inference).
It appears in: +- Debugging runs on GPUs without bf16 support (e.g., some older V100 configs) +- Research runs requiring higher numerical precision +- CPU-only inference (not GPU-relevant here) + +Recommendation: add a validation warning in the planner if `--dtype float32` is requested, +since capacity estimates will be unreliable until the fix is implemented. + +--- + +## `--kv-cache-dtype` (auto / fp8) + +**Conclusion: does NOT change allocatable KV GiB, but doubles token capacity. The planner +correctly predicts KV GiB, but does not model the token-count implication of fp8.** + +When `--kv-cache-dtype=fp8`, vLLM stores each KV element in 1 byte instead of 2. +vLLM allocates the **same number of bytes** for KV regardless — but twice as many tokens +fit within that budget. + +| Model | kv_cache_dtype | Weight (GiB) | KV avail (GiB) | KV tokens | Block size (bytes) | +|---|---|---|---|---|---| +| Llama-3.1-8B | auto (bf16) | 14.99 | 58.11 | 476 000 | 2 097 315 / block | +| Llama-3.1-8B | fp8 | 14.99 | **58.11** | **952 032** | 1 048 622 / block | +| Qwen2.5-7B | auto (bf16) | 14.25 | 58.53 | 1 096 000 | 917 461 / block | +| Qwen2.5-7B | fp8 | 14.25 | **58.53** | **2 192 000** | 458 730 / block | + +Observations: +- KV GiB is **identical** — the allocatable memory budget is dtype-agnostic. +- KV token count **doubles** — fp8 halves bytes-per-element. +- Block size (bytes) **halves** — confirms fp8 is applied at the element level. + +**Planner accuracy for KV GiB**: unaffected by this flag. Error for both runs is +identical (e.g. −3.5% for Llama-3.1-8B) because the planner computes in GiB. + +**Gap**: the planner does not expose a token-count or max-concurrency estimate. +Any downstream code that converts predicted KV GiB → token count must apply: + +``` +kv_tokens = kv_cache_gib × GiB_in_bytes / per_token_bytes(kv_cache_dtype) +``` + +where, for standard MHA/GQA attention, `per_token_bytes = 2 (K and V) × num_layers × num_kv_heads × head_dim × bytes_per_element`, +and `bytes_per_element` is 2 for `auto`/bf16/fp16 and 1 for `fp8`.
+ +--- + +## Weight quantization (`--quantization` / model-embedded) + +### Small quantized models (Llama-3.1-8B w8a8 and w4a16) + +**Conclusion: weight and KV predictions are accurate. The planner correctly reads +quantization config from the HuggingFace model and adjusts bytes-per-parameter.** + +| Model | Quant | Weight measured | Weight predicted | Weight err | KV measured | KV predicted | KV err | +|---|---|---|---|---|---|---|---| +| Llama-3.1-8B | fp16 (baseline) | 14.99 GiB | ~14.96 GiB | −0.2% | 58.11 GiB | ~56.09 GiB | −3.5% | +| Llama-3.1-8B | w8a8 | 8.49 GiB | ~8.46 GiB | −0.4% | 64.60 GiB | ~62.59 GiB | −3.1% | +| Llama-3.1-8B | w4a16 | 5.38 GiB | ~5.34 GiB | −0.7% | 67.71 GiB | ~65.69 GiB | −3.0% | + +Key observations: +- Weight reduction is correctly modeled: w8a8 saves ~6.5 GiB, w4a16 saves ~9.6 GiB. +- KV budget expands proportionally as expected (more room once weights shrink). +- The ~3% KV under-prediction is consistent with the unquantized baseline, indicating + it comes from the activation constant overestimate (see below), not quantization handling. +- `quantization: null` in vllm_args is expected for these models — the quantization is + embedded in the model weights; vLLM detects it automatically via the model config. + +### Large quantized models (Llama-3.3-70B w8a8) + +**Conclusion: weight is predicted correctly, but KV is significantly under-predicted (−32.7% +at TP=1). Root cause: the activation constant (5.5 GiB) was calibrated on fp16 models. 
+W8A8 reduces activation memory because intermediate tensors are int8.** + +| TP | Weight measured | KV measured | KV err | Derived activation | +|---|---|---|---|---| +| TP=1 | 67.72 GiB | 5.01 GiB | −32.7% | ~2.3 GiB | +| TP=2 | 33.88 GiB/GPU | 37.28 GiB/GPU | +5.0% | (residual absorbed) | +| TP=4 | 16.96 GiB/GPU | 54.08 GiB/GPU | +5.9% | (residual absorbed) | + +Deriving actual activation memory at TP=1: + +``` +available = 80 GiB × 0.95 = 76.0 GiB +consumed = weight + cuda_graph + non_torch + kv + = 67.72 + 0.84 + 0.15 + 5.01 = 73.72 GiB +residual = 76.0 - 73.72 = 2.28 GiB → actual activation ≈ 2.3 GiB +``` + +vs planner constant = **5.5 GiB** → overestimates by ~3.2 GiB → predicts 3.2 GiB less KV. + +This explains the direction of the −32.7% error on a model with only ~5 GiB of usable KV, +but not its exact size: a 3.2 GiB over-reservation on a 5 GiB budget would give +`(predicted − measured) / measured = (1.81 − 5.01) / 5.01 = −63.8%`, roughly twice the +measured −32.7% (which corresponds to a predicted KV of ≈3.37 GiB, i.e. ≈1.6 GiB of +over-reservation). Treat the residual-derived activation figure as approximate. + +**Why does w8a8 reduce activation memory?** +With int8 activations, intermediate tensors (attention scores, MLP buffers) are 1 byte/element +instead of 2. For a 70B model, this approximately halves the activation footprint. For small +models (8B), the same effect is present but the absolute magnitude (~1 GiB) is small relative +to the large KV budget (~64 GiB), so it's only a −3% KV error. + +**Why does the error disappear at TP≥2?** +At TP=2 and TP=4, the weight per GPU drops significantly (33.9 / 17.0 GiB), leaving much +more room for KV. A 3 GiB activation over-reservation becomes a small fraction of the +large available KV budget, so the relative error is within ±6%. + +--- + +## Implications for planner calibration + +1. **`--dtype` (bf16 / fp16 / auto): no action needed.** The planner reads the actual safetensors dtype and + handles bf16, fp16, and auto identically and correctly. `--dtype float32` is the exception: + it requires the `dtype_override` fix described in the `--dtype float32` section above. + +2. **`--kv-cache-dtype`: add token-count conversion.** KV GiB prediction is correct.
+ Expose a downstream conversion: `kv_tokens = kv_gib × GiB / per_token_bytes(kv_dtype)`, + where `per_token_bytes = 2 (K and V) × num_layers × num_kv_heads × head_dim × per_element_bytes(kv_dtype)`. + The planner's `KVCacheDetail` already has `kv_data_type` — this can be used to compute + the per-token cost and hence max concurrency. + +3. **Weight quantization on small models: no action needed.** The planner correctly reads + quantization config and applies the right bytes-per-parameter. + +4. **W8A8 activation constant on large models: needs a separate constant.** + Introduce `ACTIVATION_MEMORY_BASE_DENSE_W8A8_GIB ≈ 2.3` for large w8a8 models, or + scale the existing constant by dtype precision: `activation × (quant_bytes / 2.0)`. + Evidence: 70B w8a8 shows ~2.3 GiB actual vs 5.5 GiB assumed; 8B w8a8 shows ~1.5 GiB. + A simple heuristic: `activation_for_quantized = activation_fp16 × 0.45`. + +--- + +## What does NOT affect predictions + +- **`max_model_len`**: KV error is identical across 2K–32K for both Llama-3.1-8B and + Qwen2.5-7B at TP=1. The formula is correctly context-length-agnostic (allocates by bytes, + not by tokens). + +- **`gpu_memory_utilization`**: scaling the utilization factor is a linear multiplier in the + formula; no systematic error expected as long as the value matches what's passed to vLLM. + +- **TP/PP degree** (after normalisation): the ÷(TP×PP) correction brings most multi-GPU + KV errors within ±10%, with residual positive bias coming from NCCL buffer overhead + not captured in the `non_torch` constant.
diff --git a/accuracy/results/v0.19.0/report.md b/accuracy/results/v0.19.0/report.md new file mode 100644 index 00000000..772e92c1 --- /dev/null +++ b/accuracy/results/v0.19.0/report.md @@ -0,0 +1,142 @@ +# Memory Validation Report + +## Per-component error + +| Model | TP | PP | DP | max_len | Weight | Activation | Non-torch | KV cache | +|---|---|---|---|---|---|---|---|---| +| deepseek-ai/DeepSeek-V2-Lite-Chat | 1 | 1 | 1 | 8192 | -0.6% | — | — | -11.5% | +| deepseek-ai/DeepSeek-V2-Lite-Chat | 2 | 1 | 1 | 8192 | -1.3% | — | — | +0.6% | +| deepseek-ai/DeepSeek-V2-Lite-Chat | 4 | 1 | 1 | 8192 | -2.7% | — | — | +3.7% | +| RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic | 2 | 1 | 1 | 8192 | -0.1% | — | — | +5.0% | +| RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic | 4 | 1 | 1 | 8192 | -0.2% | — | — | +5.9% | +| ibm-granite/granite-3.1-2b-instruct | 1 | 1 | 1 | 8192 | -0.4% | — | — | -5.3% | +| ibm-granite/granite-3.1-2b-instruct | 2 | 1 | 1 | 8192 | -0.8% | — | — | +0.4% | +| ibm-granite/granite-3.1-8b-instruct | 4 | 1 | 1 | 8192 | -1.8% | — | — | +2.7% | +| ibm-granite/granite-3.3-8b-instruct | 1 | 1 | 1 | 8192 | -0.2% | — | — | -6.0% | +| ibm-granite/granite-3.3-8b-instruct | 2 | 1 | 1 | 8192 | -0.4% | — | — | +0.6% | +| ibm-granite/granite-3.3-8b-instruct | 4 | 1 | 1 | 8192 | -1.8% | — | — | +2.7% | +| ibm-granite/granite-vision-3.3-2b | 1 | 1 | 1 | 8192 | 0.0% | — | — | -1.2% | +| ibm-granite/granite-vision-3.3-2b | 2 | 1 | 1 | 8192 | -0.7% | — | — | +2.6% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | -0.2% | — | — | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | -0.2% | — | — | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | -0.2% | — | — | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | -50.1% | — | — | +31.1% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 2048 | -0.2% | — | — | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 4096 | -0.2% | — | — | -3.5% | +| 
meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | -0.2% | — | — | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 2 | 1 | 8192 | -0.4% | — | — | -0.9% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 4 | 1 | 8192 | -12.2% | — | — | +1.6% | +| meta-llama/Llama-3.1-8B-Instruct | 2 | 1 | 1 | 8192 | -0.4% | — | — | +2.8% | +| meta-llama/Llama-3.1-8B-Instruct | 4 | 1 | 1 | 8192 | -0.8% | — | — | +4.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 32768 | -0.2% | — | — | -3.5% | +| meta-llama/Llama-4-Scout-17B-16E-Instruct | 4 | 1 | 1 | 8192 | -4.8% | — | — | +36.2% | +| microsoft/phi-4 | 1 | 1 | 1 | 8192 | -0.3% | — | — | -6.6% | +| microsoft/phi-4 | 2 | 1 | 1 | 8192 | -0.9% | — | — | +2.0% | +| mistralai/Mistral-Small-3.1-24B-Instruct-2503 | 1 | 1 | 1 | 8192 | -0.1% | — | — | +1.6% | +| mistralai/Mistral-Small-3.1-24B-Instruct-2503 | 2 | 1 | 1 | 8192 | -1.9% | — | — | +7.2% | +| mistralai/Mistral-Small-3.1-24B-Instruct-2503 | 4 | 1 | 1 | 8192 | -5.7% | — | — | +7.5% | +| mistralai/Mixtral-8x7B-Instruct-v0.1 | 2 | 1 | 1 | 8192 | -0.0% | — | — | -1.9% | +| mistralai/Mixtral-8x7B-Instruct-v0.1 | 4 | 1 | 1 | 8192 | -0.0% | — | — | +2.4% | +| moonshotai/Kimi-Dev-72B | 2 | 1 | 1 | 8192 | -0.2% | — | — | +61.9% | +| moonshotai/Kimi-Dev-72B | 4 | 1 | 1 | 8192 | -0.4% | — | — | +9.3% | +| moonshotai/Kimi-VL-A3B-Instruct | 1 | 1 | 1 | 8192 | -0.6% | — | — | -9.8% | +| moonshotai/Kimi-VL-A3B-Instruct | 2 | 1 | 1 | 8192 | -1.6% | — | — | +2.4% | +| openai/gpt-oss-20b | 4 | 1 | 1 | 8192 | -11.8% | — | — | +5.5% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | -0.4% | — | — | -4.2% | +| Qwen/Qwen2.5-72B-Instruct | 2 | 1 | 1 | 8192 | -0.1% | — | — | +60.9% | +| Qwen/Qwen2.5-72B-Instruct | 4 | 1 | 1 | 8192 | -0.4% | — | — | +9.3% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | -0.4% | — | — | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | -0.4% | — | — | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 16384 | -0.4% | — | — | -4.2% | +| Qwen/Qwen2.5-7B-Instruct 
| 1 | 1 | 1 | 32768 | -0.4% | — | — | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 2048 | -0.4% | — | — | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 4096 | -0.4% | — | — | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | -0.4% | — | — | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 2 | 1 | 1 | 8192 | -0.4% | — | — | +2.6% | +| Qwen/Qwen2.5-7B-Instruct | 4 | 1 | 1 | 8192 | 0.0% | — | — | +4.6% | +| Qwen/Qwen3-30B-A3B | 1 | 1 | 1 | 8192 | -0.0% | — | — | -28.7% | +| Qwen/Qwen3-30B-A3B | 4 | 1 | 1 | 8192 | -0.2% | — | — | +5.4% | +| Qwen/Qwen3-8B | 1 | 1 | 1 | 8192 | -0.1% | — | — | -4.4% | +| Qwen/Qwen3-8B | 4 | 1 | 1 | 8192 | -0.3% | — | — | +4.7% | +| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | 1 | 1 | 1 | 8192 | -0.4% | — | — | -3.1% | +| redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 | 1 | 1 | 1 | 8192 | -0.1% | — | — | -32.7% | +| redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 | 2 | 1 | 1 | 8192 | -0.1% | — | — | +5.0% | +| redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 | 4 | 1 | 1 | 8192 | -0.2% | — | — | +5.9% | +| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | 1 | 1 | 1 | 8192 | -0.4% | — | — | -3.1% | +| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 | 1 | 1 | 1 | 8192 | -0.7% | — | — | -3.0% | +| RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 | 1 | 1 | 1 | 8192 | -0.2% | — | — | +1.2% | +| RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 | 2 | 1 | 1 | 8192 | -0.8% | — | — | +5.3% | +| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | 1 | 1 | 1 | 8192 | -0.4% | — | — | -3.1% | +| RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 | 1 | 1 | 1 | 8192 | -0.4% | — | — | -3.9% | + +## Per-architecture error + +_Group by architecture class. 
Mean and max absolute error per component._ + +## Argument sensitivity + +### max_model_len sweep + +| Value | Weight | Activation | Non-torch | KV cache | +|---|---|---|---|---| +| 2048 | -0.2% | — | — | -3.5% | +| 4096 | -0.2% | — | — | -3.5% | +| 32768 | -0.2% | — | — | -3.5% | +| 16384 | -0.4% | — | — | -4.2% | +| 32768 | -0.4% | — | — | -4.2% | +| 2048 | -0.4% | — | — | -4.2% | +| 4096 | -0.4% | — | — | -4.2% | + +### pp sweep + +| Value | Weight | Activation | Non-torch | KV cache | +|---|---|---|---|---| +| 2 | -0.4% | — | — | -0.9% | +| 4 | -12.2% | — | — | +1.6% | + +### dtype sweep + +| Value | Weight | Activation | Non-torch | KV cache | +|---|---|---|---|---| +| bfloat16 | -0.2% | — | — | -3.5% | +| float32 | -50.1% | — | — | +31.1% | + +### quantization sweep + +| Value | Weight | Activation | Non-torch | KV cache | +|---|---|---|---|---| +| None | -0.1% | — | — | +5.0% | +| None | -0.2% | — | — | +5.9% | +| None | -0.4% | — | — | -3.1% | +| None | -0.4% | — | — | -3.1% | +| None | -0.7% | — | — | -3.0% | +| None | -0.2% | — | — | +1.2% | +| None | -0.8% | — | — | +5.3% | +| None | -0.4% | — | — | -3.1% | +| None | -0.4% | — | — | -3.9% | + +### kv_cache_dtype sweep + +| Value | Weight | Activation | Non-torch | KV cache | +|---|---|---|---|---| +| fp8 | -0.2% | — | — | -3.5% | +| auto | -0.2% | — | — | -3.5% | +| fp8 | -0.4% | — | — | -4.2% | +| auto | -0.4% | — | — | -4.2% | +| auto | -0.4% | — | — | -4.2% | + +## Outliers + +- **deepseek-ai/DeepSeek-V2-Lite-Chat** (TP=1): {'kv_cache': -11.511121302453557} — root cause required +- **meta-llama/Llama-3.1-8B-Instruct** (TP=1): {'weight_memory': -50.100066711140755, 'kv_cache': 31.051401869158894} — root cause required +- **meta-llama/Llama-3.1-8B-Instruct** (TP=1): {'weight_memory': -12.206572769953041} — root cause required +- **meta-llama/Llama-4-Scout-17B-16E-Instruct** (TP=4): {'kv_cache': 36.179104477611936} — root cause required +- **moonshotai/Kimi-Dev-72B** (TP=2): {'kv_cache': 
61.920529801324484} — root cause required +- **openai/gpt-oss-20b** (TP=4): {'weight_memory': -11.845730027548202} — root cause required +- **Qwen/Qwen2.5-72B-Instruct** (TP=2): {'kv_cache': 60.855263157894726} — root cause required +- **Qwen/Qwen3-30B-A3B** (TP=1): {'kv_cache': -28.747566515249833} — root cause required +- **redhatai/Llama-3.3-70B-Instruct-quantized.w8a8** (TP=1): {'kv_cache': -32.73453093812375} — root cause required + +## Calibration decisions + +_Document constant changes here: old value → new value, evidence._ diff --git a/accuracy/results/v0.19.0/results.csv b/accuracy/results/v0.19.0/results.csv new file mode 100644 index 00000000..3eac3304 --- /dev/null +++ b/accuracy/results/v0.19.0/results.csv @@ -0,0 +1,65 @@ +model,gpu,tp,pp,dp,max_model_len,dtype,quantization,kv_cache_dtype,weight_error_pct,activation_error_pct,non_torch_error_pct,kv_cache_error_pct +deepseek-ai/DeepSeek-V2-Lite-Chat,H100-80GB,1,1,1,8192,auto,,auto,-0.5776418620455255,,,-11.511121302453557 +deepseek-ai/DeepSeek-V2-Lite-Chat,H100-80GB,2,1,1,8192,auto,,auto,-1.3486176668914316,,,0.5668733392382644 +deepseek-ai/DeepSeek-V2-Lite-Chat,H100-80GB,4,1,1,8192,auto,,auto,-2.6631158455392834,,,3.703122548250442 +RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic,H100-80GB,2,1,1,8192,auto,,auto,-0.11806375442738827,,,5.042918454935609 +RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic,H100-80GB,4,1,1,8192,auto,,auto,-0.23584905660376854,,,5.897578110556475 +ibm-granite/granite-3.1-2b-instruct,H100-80GB,1,1,1,8192,auto,,auto,-0.421940928270052,,,-5.268475750577376 +ibm-granite/granite-3.1-2b-instruct,H100-80GB,2,1,1,8192,auto,,auto,-0.8403361344537823,,,0.44298370963132644 +ibm-granite/granite-3.1-8b-instruct,H100-80GB,4,1,1,8192,auto,,auto,-1.8087855297157693,,,2.6758297996783127 +ibm-granite/granite-3.3-8b-instruct,H100-80GB,1,1,1,8192,auto,,auto,-0.19672131147540564,,,-6.017729287419015 
+ibm-granite/granite-3.3-8b-instruct,H100-80GB,2,1,1,8192,auto,,auto,-0.39267015706805447,,,0.5876894525208934 +ibm-granite/granite-3.3-8b-instruct,H100-80GB,4,1,1,8192,auto,,auto,-1.8087855297157693,,,2.6758297996783127 +ibm-granite/granite-vision-3.3-2b,H100-80GB,1,1,1,8192,auto,,auto,0.0,,,-1.2235979606700704 +ibm-granite/granite-vision-3.3-2b,H100-80GB,2,1,1,8192,auto,,auto,-0.7168458781362014,,,2.6164462334675003 +meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,8192,bfloat16,,fp8,-0.20013342228151673,,,-3.476165892273268 +meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,8192,bfloat16,,auto,-0.20013342228151673,,,-3.476165892273268 +meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,8192,float16,,auto,-0.20013342228151673,,,-3.476165892273268 +meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,8192,float32,,auto,-50.100066711140755,,,31.051401869158894 +meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,2048,auto,,auto,-0.20013342228151673,,,-3.476165892273268 +meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,4096,auto,,auto,-0.20013342228151673,,,-3.476165892273268 +meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,8192,auto,,auto,-0.20013342228151673,,,-3.476165892273268 +meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,2,1,8192,auto,,auto,-0.3994673768308836,,,-0.8566275924256197 +meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,4,1,8192,auto,,auto,-12.206572769953041,,,1.59025787965616 +meta-llama/Llama-3.1-8B-Instruct,H100-80GB,2,1,1,8192,auto,,auto,-0.3994673768308836,,,2.7603513174403984 +meta-llama/Llama-3.1-8B-Instruct,H100-80GB,4,1,1,8192,auto,,auto,-0.795755968169756,,,4.478054567022532 +meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,32768,auto,,auto,-0.20013342228151673,,,-3.476165892273268 +meta-llama/Llama-4-Scout-17B-16E-Instruct,H100-80GB,4,1,1,8192,auto,,auto,-4.762801204819266,,,36.179104477611936 +microsoft/phi-4,H100-80GB,1,1,1,8192,auto,,auto,-0.29207740051114217,,,-6.597222222222221 
+microsoft/phi-4,H100-80GB,2,1,1,8192,auto,,auto,-0.8714596949891011,,,1.9526524969759853 +mistralai/Mistral-Small-3.1-24B-Instruct-2503,H100-80GB,1,1,1,8192,auto,,auto,-0.08936550491510087,,,1.5608371763036457 +mistralai/Mistral-Small-3.1-24B-Instruct-2503,H100-80GB,2,1,1,8192,auto,,auto,-1.9298245614035143,,,7.159114421684257 +mistralai/Mistral-Small-3.1-24B-Instruct-2503,H100-80GB,4,1,1,8192,auto,,auto,-5.733558178752106,,,7.451841838458945 +mistralai/Mixtral-8x7B-Instruct-v0.1,H100-80GB,2,1,1,8192,auto,,auto,-0.022983222247754564,,,-1.8987341772151993 +mistralai/Mixtral-8x7B-Instruct-v0.1,H100-80GB,4,1,1,8192,auto,,auto,-0.04595588235294836,,,2.4191949236565513 +moonshotai/Kimi-Dev-72B,H100-80GB,2,1,1,8192,auto,,auto,-0.16219404305514515,,,61.920529801324484 +moonshotai/Kimi-Dev-72B,H100-80GB,4,1,1,8192,auto,,auto,-0.44104675095559714,,,9.313725490196083 +moonshotai/Kimi-VL-A3B-Instruct,H100-80GB,1,1,1,8192,auto,,auto,-0.5855562784645404,,,-9.753146176185869 +moonshotai/Kimi-VL-A3B-Instruct,H100-80GB,2,1,1,8192,auto,,auto,-1.6097875080489377,,,2.4461482292807526 +openai/gpt-oss-20b,H100-80GB,4,1,1,8192,auto,,auto,-11.845730027548202,,,5.4529067147363826 +Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,8192,bfloat16,,fp8,-0.42105263157895084,,,-4.220058089868441 +Qwen/Qwen2.5-72B-Instruct,H100-80GB,2,1,1,8192,auto,,auto,-0.13274336283186344,,,60.855263157894726 +Qwen/Qwen2.5-72B-Instruct,H100-80GB,4,1,1,8192,auto,,auto,-0.35314891112418323,,,9.254218835057154 +Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,8192,bfloat16,,auto,-0.42105263157895084,,,-4.220058089868441 +Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,8192,float16,,auto,-0.42105263157895084,,,-4.220058089868441 +Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,16384,auto,,auto,-0.42105263157895084,,,-4.220058089868441 +Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,32768,auto,,auto,-0.42105263157895084,,,-4.220058089868441 +Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,2048,auto,,auto,-0.42105263157895084,,,-4.220058089868441 
+Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,4096,auto,,auto,-0.42105263157895084,,,-4.220058089868441 +Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,8192,auto,,auto,-0.42105263157895084,,,-4.220058089868441 +Qwen/Qwen2.5-7B-Instruct,H100-80GB,2,1,1,8192,auto,,auto,-0.42134831460674504,,,2.615914786967421 +Qwen/Qwen2.5-7B-Instruct,H100-80GB,4,1,1,8192,auto,,auto,0.0,,,4.618354618354617 +Qwen/Qwen3-30B-A3B,H100-80GB,1,1,1,8192,auto,,auto,-0.017580872011260754,,,-28.747566515249833 +Qwen/Qwen3-30B-A3B,H100-80GB,4,1,1,8192,auto,,auto,-0.2105263157894692,,,5.3587324194409796 +Qwen/Qwen3-8B,H100-80GB,1,1,1,8192,auto,,auto,-0.06548788474132146,,,-4.365217391304344 +Qwen/Qwen3-8B,H100-80GB,4,1,1,8192,auto,,auto,-0.26178010471203633,,,4.683072334079046 +RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8,H100-80GB,1,1,1,8192,float16,,auto,-0.3533568904593564,,,-3.1114551083591193 +redhatai/Llama-3.3-70B-Instruct-quantized.w8a8,H100-80GB,1,1,1,8192,auto,,auto,-0.05906674542231548,,,-32.73453093812375 +redhatai/Llama-3.3-70B-Instruct-quantized.w8a8,H100-80GB,2,1,1,8192,auto,,auto,-0.11806375442738827,,,5.042918454935609 +redhatai/Llama-3.3-70B-Instruct-quantized.w8a8,H100-80GB,4,1,1,8192,auto,,auto,-0.23584905660376854,,,5.917159763313615 +RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8,H100-80GB,1,1,1,8192,float16,,auto,-0.3533568904593564,,,-3.1114551083591193 +RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16,H100-80GB,1,1,1,8192,float16,,auto,-0.7434944237918222,,,-2.953773445576725 +RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8,H100-80GB,1,1,1,8192,auto,,auto,-0.16618196925633216,,,1.2107531294890281 +RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8,H100-80GB,2,1,1,8192,auto,,auto,-0.8257638315441755,,,5.286343612334797 +RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8,H100-80GB,1,1,1,8192,float16,,auto,-0.3533568904593564,,,-3.1114551083591193 
+RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8,H100-80GB,1,1,1,8192,auto,,auto,-0.3685503685503825,,,-3.8675742574257423 diff --git a/accuracy/results/v0.19.0/runs/deepseek-ai-deepseek-v2---h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/deepseek-ai-deepseek-v2---h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..cfb90b25 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/deepseek-ai-deepseek-v2---h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "deepseek-ai/DeepSeek-V2-Lite-Chat", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T21:50:52.236219+00:00", + "log_path": "/data/results/v0.19.0/logs/deepseek-ai-deepseek-v2---h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 29.43, + "kv_cache_memory_gib": 43.61, + "cuda_graph_memory_gib": 1.39, + "max_concurrency": 183.78, + "kv_cache_tokens": 1505552, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 94097, + "kv_block_size_bytes": 497634 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/deepseek-ai-deepseek-v2---h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/deepseek-ai-deepseek-v2---h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..5bb0c06a --- /dev/null +++ b/accuracy/results/v0.19.0/runs/deepseek-ai-deepseek-v2---h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "deepseek-ai/DeepSeek-V2-Lite-Chat", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T22:28:33.109772+00:00", + "log_path": 
"/data/results/v0.19.0/logs/deepseek-ai-deepseek-v2---h100-80gb--tp2pp1dp1--8192.log", + "weight_memory_gib": 14.83, + "kv_cache_memory_gib": 56.45, + "cuda_graph_memory_gib": 1.33, + "max_concurrency": 237.88, + "kv_cache_tokens": 1948752, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 121797, + "kv_block_size_bytes": 497653 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/deepseek-ai-deepseek-v2---h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/deepseek-ai-deepseek-v2---h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..ace131ae --- /dev/null +++ b/accuracy/results/v0.19.0/runs/deepseek-ai-deepseek-v2---h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "deepseek-ai/DeepSeek-V2-Lite-Chat", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T21:53:44.981095+00:00", + "log_path": "/data/results/v0.19.0/logs/deepseek-ai-deepseek-v2---h100-80gb--tp4pp1dp1--8192.log", + "weight_memory_gib": 7.51, + "kv_cache_memory_gib": 63.73, + "cuda_graph_memory_gib": 1.11, + "max_concurrency": 268.54, + "kv_cache_tokens": 2199888, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 137493, + "kv_block_size_bytes": 497694 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/fp8dyn-llama-3-3-70b--h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/fp8dyn-llama-3-3-70b--h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..a1241ee2 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/fp8dyn-llama-3-3-70b--h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1,26 @@ +{ + "model": "RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 2, + 
"pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-21T21:13:21.310192+00:00", + "log_path": "/data/results/v0.19.0/logs/fp8dyn-llama-3-3-70b--h100-80gb--tp2pp1dp1--8192.log", + "weight_memory_gib": 33.88, + "kv_cache_memory_gib": 37.28, + "cuda_graph_memory_gib": 1.82, + "max_concurrency": 29.82, + "kv_cache_tokens": 244304, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 15269, + "kv_block_size_bytes": 2621592, + "_sweep_dim": "quantization" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/fp8dyn-llama-3-3-70b--h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/fp8dyn-llama-3-3-70b--h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..ca81cd8f --- /dev/null +++ b/accuracy/results/v0.19.0/runs/fp8dyn-llama-3-3-70b--h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1,26 @@ +{ + "model": "RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-21T21:15:52.962454+00:00", + "log_path": "/data/results/v0.19.0/logs/fp8dyn-llama-3-3-70b--h100-80gb--tp4pp1dp1--8192.log", + "weight_memory_gib": 16.96, + "kv_cache_memory_gib": 54.09, + "cuda_graph_memory_gib": 1.7, + "max_concurrency": 86.55, + "kv_cache_tokens": 708992, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 44312, + "kv_block_size_bytes": 1310676, + "_sweep_dim": "quantization" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-1---h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-1---h100-80gb--tp1pp1dp1--8192.json new 
file mode 100644 index 00000000..e2ec2ac4 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-1---h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "ibm-granite/granite-3.1-2b-instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T21:39:00.017622+00:00", + "log_path": "/data/results/v0.19.0/logs/ibm-granite-granite-3-1---h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 4.74, + "kv_cache_memory_gib": 69.28, + "cuda_graph_memory_gib": 1.6, + "max_concurrency": 110.85, + "kv_cache_tokens": 908048, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 56753, + "kv_block_size_bytes": 1310747 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-1---h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-1---h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..583b7598 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-1---h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "ibm-granite/granite-3.1-2b-instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T23:21:07.369990+00:00", + "log_path": "/data/results/v0.19.0/logs/ibm-granite-granite-3-1---h100-80gb--tp2pp1dp1--8192.log", + "weight_memory_gib": 2.38, + "kv_cache_memory_gib": 69.98, + "cuda_graph_memory_gib": 0.86, + "max_concurrency": 223.94, + "kv_cache_tokens": 1834528, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 114658, + 
"kv_block_size_bytes": 655344 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-1---h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-1---h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..ce404fc5 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-1---h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "ibm-granite/granite-3.1-8b-instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T21:42:00.445587+00:00", + "log_path": "/data/results/v0.19.0/logs/ibm-granite-granite-3-1---h100-80gb--tp4pp1dp1--8192.log", + "weight_memory_gib": 3.87, + "kv_cache_memory_gib": 68.39, + "cuda_graph_memory_gib": 0.87, + "max_concurrency": 218.86, + "kv_cache_tokens": 1792912, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 112057, + "kv_block_size_bytes": 655320 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-3---h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-3---h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..fc2317a3 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-3---h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "ibm-granite/granite-3.3-8b-instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T21:43:40.918313+00:00", + "log_path": 
"/data/results/v0.19.0/logs/ibm-granite-granite-3-3---h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 15.25, + "kv_cache_memory_gib": 58.66, + "cuda_graph_memory_gib": 0.74, + "max_concurrency": 46.93, + "kv_cache_tokens": 384432, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 24027, + "kv_block_size_bytes": 2621454 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-3---h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-3---h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..3e2d8a38 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-3---h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "ibm-granite/granite-3.3-8b-instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T22:26:10.130728+00:00", + "log_path": "/data/results/v0.19.0/logs/ibm-granite-granite-3-3---h100-80gb--tp2pp1dp1--8192.log", + "weight_memory_gib": 7.64, + "kv_cache_memory_gib": 64.66, + "cuda_graph_memory_gib": 0.91, + "max_concurrency": 103.46, + "kv_cache_tokens": 847536, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 52971, + "kv_block_size_bytes": 1310682 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-3---h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-3---h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..ada4fa1a --- /dev/null +++ b/accuracy/results/v0.19.0/runs/ibm-granite-granite-3-3---h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "ibm-granite/granite-3.3-8b-instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 4, + 
"pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T21:45:56.139495+00:00", + "log_path": "/data/results/v0.19.0/logs/ibm-granite-granite-3-3---h100-80gb--tp4pp1dp1--8192.log", + "weight_memory_gib": 3.87, + "kv_cache_memory_gib": 68.39, + "cuda_graph_memory_gib": 0.87, + "max_concurrency": 218.86, + "kv_cache_tokens": 1792912, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 112057, + "kv_block_size_bytes": 655320 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/ibm-granite-granite-visi--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/ibm-granite-granite-visi--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..3851fd83 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/ibm-granite-granite-visi--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "ibm-granite/granite-vision-3.3-2b", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T21:47:26.617194+00:00", + "log_path": "/data/results/v0.19.0/logs/ibm-granite-granite-visi--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 5.54, + "kv_cache_memory_gib": 68.65, + "cuda_graph_memory_gib": 0.62, + "max_concurrency": 109.84, + "kv_cache_tokens": 899792, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 56237, + "kv_block_size_bytes": 1310745 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/ibm-granite-granite-visi--h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/ibm-granite-granite-visi--h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..8e2c76ff --- 
/dev/null +++ b/accuracy/results/v0.19.0/runs/ibm-granite-granite-visi--h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "ibm-granite/granite-vision-3.3-2b", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T23:02:29.029074+00:00", + "log_path": "/data/results/v0.19.0/logs/ibm-granite-granite-visi--h100-80gb--tp2pp1dp1--8192.log", + "weight_memory_gib": 2.79, + "kv_cache_memory_gib": 69.56, + "cuda_graph_memory_gib": 0.86, + "max_concurrency": 222.58, + "kv_cache_tokens": 1823408, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 113963, + "kv_block_size_bytes": 655383 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.json b/accuracy/results/v0.19.0/runs/meta-llama---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.json new file mode 100644 index 00000000..58f78b82 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.json @@ -0,0 +1,26 @@ +{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "bfloat16", + "quantization": null, + "kv_cache_dtype": "fp8" + }, + "timestamp": "2026-04-21T02:55:12.485129+00:00", + "log_path": "/data/results/v0.19.0/logs/meta-llama---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.log", + "weight_memory_gib": 14.99, + "kv_cache_memory_gib": 58.11, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 116.21, + "kv_cache_tokens": 952032, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 59502, + "kv_block_size_bytes": 1048622, + "_sweep_dim": "kv_cache_dtype" 
+} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-l--h100-80gb--tp1pp1dp1--8192-dtf16-kvfp8.json b/accuracy/results/v0.19.0/runs/meta-llama-l--h100-80gb--tp1pp1dp1--8192-dtf16-kvfp8.json new file mode 100644 index 00000000..cdbb807f --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama-l--h100-80gb--tp1pp1dp1--8192-dtf16-kvfp8.json @@ -0,0 +1,4 @@ +{ + "skipped": true, + "reason": "--kv-cache-dtype=fp8 fails with FLASH_ATTN backend (exit 1). Consistent failure in vLLM v0.19.0 with VLLM_ATTENTION_BACKEND=FLASH_ATTN." +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama---h100-80gb--tp1pp1dp1--8192-dtbf16.json b/accuracy/results/v0.19.0/runs/meta-llama-llama---h100-80gb--tp1pp1dp1--8192-dtbf16.json new file mode 100644 index 00000000..9480f1eb --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama-llama---h100-80gb--tp1pp1dp1--8192-dtbf16.json @@ -0,0 +1,26 @@ +{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "bfloat16", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-21T01:01:23.671736+00:00", + "log_path": "/data/results/v0.19.0/logs/meta-llama-llama---h100-80gb--tp1pp1dp1--8192-dtbf16.log", + "weight_memory_gib": 14.99, + "kv_cache_memory_gib": 58.11, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 58.11, + "kv_cache_tokens": 476000, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 29750, + "kv_block_size_bytes": 2097315, + "_sweep_dim": "dtype" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama-3--h100-80gb--tp1pp1dp1--8192-dtf16.json b/accuracy/results/v0.19.0/runs/meta-llama-llama-3--h100-80gb--tp1pp1dp1--8192-dtf16.json new file mode 100644 index 00000000..6cd5fe63 --- /dev/null +++ 
b/accuracy/results/v0.19.0/runs/meta-llama-llama-3--h100-80gb--tp1pp1dp1--8192-dtf16.json @@ -0,0 +1,26 @@ +{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "float16", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T23:28:07.659664+00:00", + "log_path": "/data/results/v0.19.0/logs/meta-llama-llama-3--h100-80gb--tp1pp1dp1--8192-dtf16.log", + "weight_memory_gib": 14.99, + "kv_cache_memory_gib": 58.11, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 58.11, + "kv_cache_tokens": 476016, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 29751, + "kv_block_size_bytes": 2097245, + "_sweep_dim": "kv_cache_dtype" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama-3--h100-80gb--tp1pp1dp1--8192-dtf32.json b/accuracy/results/v0.19.0/runs/meta-llama-llama-3--h100-80gb--tp1pp1dp1--8192-dtf32.json new file mode 100644 index 00000000..1c23e3a7 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama-llama-3--h100-80gb--tp1pp1dp1--8192-dtf32.json @@ -0,0 +1,26 @@ +{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "float32", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-21T18:07:52.308611+00:00", + "log_path": "/data/results/v0.19.0/logs/meta-llama-llama-3--h100-80gb--tp1pp1dp1--8192-dtf32.log", + "weight_memory_gib": 29.98, + "kv_cache_memory_gib": 42.8, + "cuda_graph_memory_gib": 0.8, + "max_concurrency": 21.4, + "kv_cache_tokens": 175296, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 10956, + "kv_block_size_bytes": 4194610, + "_sweep_dim": 
"dtype" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--2048.json b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--2048.json new file mode 100644 index 00000000..7738af00 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--2048.json @@ -0,0 +1,26 @@ +{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 2048, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T22:41:47.807000+00:00", + "log_path": "/data/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--2048.log", + "weight_memory_gib": 14.99, + "kv_cache_memory_gib": 58.11, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 232.43, + "kv_cache_tokens": 476016, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 29751, + "kv_block_size_bytes": 2097245, + "_sweep_dim": "max_model_len" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--4096.json b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--4096.json new file mode 100644 index 00000000..d6981bc3 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--4096.json @@ -0,0 +1,26 @@ +{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 4096, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T23:03:46.668059+00:00", + "log_path": 
"/data/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--4096.log", + "weight_memory_gib": 14.99, + "kv_cache_memory_gib": 58.11, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 116.21, + "kv_cache_tokens": 476016, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 29751, + "kv_block_size_bytes": 2097245, + "_sweep_dim": "max_model_len" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..486dc5d7 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T20:19:26.784121+00:00", + "log_path": "/data/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 14.99, + "kv_cache_memory_gib": 58.11, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 58.11, + "kv_cache_tokens": 476000, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 29750, + "kv_block_size_bytes": 2097315 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp2dp1--8192.json b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp2dp1--8192.json new file mode 100644 index 00000000..63af0329 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp2dp1--8192.json @@ -0,0 +1,26 @@ +{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + 
"tensor_parallel_size": 1, + "pipeline_parallel_size": 2, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T23:09:51.873997+00:00", + "log_path": "/data/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp2dp1--8192.log", + "weight_memory_gib": 7.51, + "kv_cache_memory_gib": 66.54, + "cuda_graph_memory_gib": 0.54, + "max_concurrency": 131.26, + "kv_cache_tokens": 1075312, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 67207, + "kv_block_size_bytes": 1063085, + "_sweep_dim": "pp" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp4dp1--8192.json b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp4dp1--8192.json new file mode 100644 index 00000000..b84ab660 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp4dp1--8192.json @@ -0,0 +1,26 @@ +{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 4, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T23:11:06.507606+00:00", + "log_path": "/data/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp4dp1--8192.log", + "weight_memory_gib": 4.26, + "kv_cache_memory_gib": 69.8, + "cuda_graph_memory_gib": 0.49, + "max_concurrency": 275.54, + "kv_cache_tokens": 2257264, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 141079, + "kv_block_size_bytes": 531242, + "_sweep_dim": "pp" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp2pp1dp1--8192.json 
b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..1dbaadc8 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T20:20:37.229958+00:00", + "log_path": "/data/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp2pp1dp1--8192.log", + "weight_memory_gib": 7.51, + "kv_cache_memory_gib": 63.76, + "cuda_graph_memory_gib": 1.69, + "max_concurrency": 127.53, + "kv_cache_tokens": 1044688, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 65293, + "kv_block_size_bytes": 1048531 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp3pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp3pp1dp1--8192.json new file mode 100644 index 00000000..632f4094 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp3pp1dp1--8192.json @@ -0,0 +1,4 @@ +{ + "skipped": true, + "reason": "tp=3 invalid: Llama-3.1-8B has 32 attention heads; 32 is not divisible by 3. vLLM validation error: 'Total number of attention heads (32) must be divisible by tensor parallel size (3)'. Valid tp values must be divisors of num_attention_heads." 
+} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..e30241c1 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T20:21:52.740796+00:00", + "log_path": "/data/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp4pp1dp1--8192.log", + "weight_memory_gib": 3.77, + "kv_cache_memory_gib": 67.44, + "cuda_graph_memory_gib": 0.57, + "max_concurrency": 269.75, + "kv_cache_tokens": 2209792, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 138112, + "kv_block_size_bytes": 524307 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--16384.json b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--16384.json new file mode 100644 index 00000000..00c441fa --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--16384.json @@ -0,0 +1 @@ +{"skipped":true,"reason":"Consistent exit-1 for Llama-3.1-8B at max_model_len=16384; 32768 succeeds, cause unknown"} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--32768.json b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--32768.json new file mode 100644 index 00000000..8594e1ae --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--32768.json @@ 
-0,0 +1,26 @@ +{ + "model": "meta-llama/Llama-3.1-8B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 32768, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T23:05:22.899035+00:00", + "log_path": "/data/results/v0.19.0/logs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--32768.log", + "weight_memory_gib": 14.99, + "kv_cache_memory_gib": 58.11, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 14.53, + "kv_cache_tokens": 476016, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 29751, + "kv_block_size_bytes": 2097245, + "_sweep_dim": "max_model_len" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..60bc1b47 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1 @@ +{"skipped":true,"reason":"OOM: model requires ~212GiB for tp1, exceeds 80GB H100"} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..8b13855b --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1 @@ +{"skipped":true,"reason":"OOM: model requires ~106GiB for tp2, exceeds 80GB H100"} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..1d9df995 --- /dev/null +++ 
b/accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T20:46:22.143822+00:00", + "log_path": "/data/results/v0.19.0/logs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.log", + "weight_memory_gib": 53.12, + "kv_cache_memory_gib": 16.75, + "cuda_graph_memory_gib": 1.09, + "max_concurrency": 44.68, + "kv_cache_tokens": 365968, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 22873, + "kv_block_size_bytes": 786305 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/microsoft-phi-4--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/microsoft-phi-4--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..1b88dd20 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/microsoft-phi-4--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "microsoft/phi-4", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T21:34:47.924844+00:00", + "log_path": "/data/results/v0.19.0/logs/microsoft-phi-4--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 27.39, + "kv_cache_memory_gib": 46.08, + "cuda_graph_memory_gib": 0.81, + "max_concurrency": 29.49, + "kv_cache_tokens": 241568, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 15098, + "kv_block_size_bytes": 3277124 +} \ No newline at end of file diff --git 
a/accuracy/results/v0.19.0/runs/microsoft-phi-4--h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/microsoft-phi-4--h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..d0a767aa --- /dev/null +++ b/accuracy/results/v0.19.0/runs/microsoft-phi-4--h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "microsoft/phi-4", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T22:21:43.952333+00:00", + "log_path": "/data/results/v0.19.0/logs/microsoft-phi-4--h100-80gb--tp2pp1dp1--8192.log", + "weight_memory_gib": 13.77, + "kv_cache_memory_gib": 57.87, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 74.07, + "kv_cache_tokens": 606784, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 37924, + "kv_block_size_bytes": 1638472 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/microsoft-phi-4--h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/microsoft-phi-4--h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..8d4e98b6 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/microsoft-phi-4--h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1 @@ +{"skipped":true,"reason":"phi-4 tp4 consistently exits 1 — possible vLLM v0.19.0 incompatibility with phi-4 at tp4"} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/mistralai-mistral-small---h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/mistralai-mistral-small---h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..e37a96b9 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/mistralai-mistral-small---h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503", + "gpu": "H100-80GB", + "vllm_args": { + 
"tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T22:19:12.919963+00:00", + "log_path": "/data/results/v0.19.0/logs/mistralai-mistral-small---h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 44.76, + "kv_cache_memory_gib": 28.19, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 22.55, + "kv_cache_tokens": 184752, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 11547, + "kv_block_size_bytes": 2621354 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/mistralai-mistral-small---h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/mistralai-mistral-small---h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..923ca673 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/mistralai-mistral-small---h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T22:59:31.846151+00:00", + "log_path": "/data/results/v0.19.0/logs/mistralai-mistral-small---h100-80gb--tp2pp1dp1--8192.log", + "weight_memory_gib": 22.8, + "kv_cache_memory_gib": 48.33, + "cuda_graph_memory_gib": 0.81, + "max_concurrency": 77.32, + "kv_cache_tokens": 633408, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 39588, + "kv_block_size_bytes": 1310850 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/mistralai-mistral-small---h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/mistralai-mistral-small---h100-80gb--tp4pp1dp1--8192.json new file mode 
100644 index 00000000..0a83ee5e --- /dev/null +++ b/accuracy/results/v0.19.0/runs/mistralai-mistral-small---h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "mistralai/Mistral-Small-3.1-24B-Instruct-2503", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T21:32:27.482816+00:00", + "log_path": "/data/results/v0.19.0/logs/mistralai-mistral-small---h100-80gb--tp4pp1dp1--8192.log", + "weight_memory_gib": 11.86, + "kv_cache_memory_gib": 59.18, + "cuda_graph_memory_gib": 0.9, + "max_concurrency": 189.37, + "kv_cache_tokens": 1551344, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 96959, + "kv_block_size_bytes": 655370 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/mistralai-mixtral-8x7b-i--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/mistralai-mixtral-8x7b-i--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..c8568d3c --- /dev/null +++ b/accuracy/results/v0.19.0/runs/mistralai-mixtral-8x7b-i--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1 @@ +{"skipped":true,"reason":"OOM: model weight ~87GiB exceeds 80GB H100 at tp1"} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/mistralai-mixtral-8x7b-i--h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/mistralai-mixtral-8x7b-i--h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..f3cdf5fe --- /dev/null +++ b/accuracy/results/v0.19.0/runs/mistralai-mixtral-8x7b-i--h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + 
"dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T23:19:46.910442+00:00", + "log_path": "/data/results/v0.19.0/logs/mistralai-mixtral-8x7b-i--h100-80gb--tp2pp1dp1--8192.log", + "weight_memory_gib": 43.51, + "kv_cache_memory_gib": 28.44, + "cuda_graph_memory_gib": 0.82, + "max_concurrency": 56.88, + "kv_cache_tokens": 465936, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 29121, + "kv_block_size_bytes": 1048632 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/mistralai-mixtral-8x7b-i--h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/mistralai-mixtral-8x7b-i--h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..7aca8bbd --- /dev/null +++ b/accuracy/results/v0.19.0/runs/mistralai-mixtral-8x7b-i--h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T22:17:32.491507+00:00", + "log_path": "/data/results/v0.19.0/logs/mistralai-mixtral-8x7b-i--h100-80gb--tp4pp1dp1--8192.log", + "weight_memory_gib": 21.76, + "kv_cache_memory_gib": 50.43, + "cuda_graph_memory_gib": 0.96, + "max_concurrency": 201.71, + "kv_cache_tokens": 1652432, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 103277, + "kv_block_size_bytes": 524306 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/moonshotai-kimi-dev-72b--h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/moonshotai-kimi-dev-72b--h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..10fc0def --- /dev/null +++ b/accuracy/results/v0.19.0/runs/moonshotai-kimi-dev-72b--h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ 
+ "model": "moonshotai/Kimi-Dev-72B", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-21T02:30:07.865511+00:00", + "log_path": "/data/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp2pp1dp1--8192.log", + "weight_memory_gib": 67.82, + "kv_cache_memory_gib": 3.02, + "cuda_graph_memory_gib": 1.63, + "max_concurrency": 2.41, + "kv_cache_tokens": 19776, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 1236, + "kv_block_size_bytes": 2623543 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..211158b9 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "moonshotai/Kimi-Dev-72B", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-21T02:38:11.371351+00:00", + "log_path": "/data/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.log", + "weight_memory_gib": 34.01, + "kv_cache_memory_gib": 36.72, + "cuda_graph_memory_gib": 1.51, + "max_concurrency": 58.75, + "kv_cache_tokens": 481296, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 30081, + "kv_block_size_bytes": 1310721 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.json 
b/accuracy/results/v0.19.0/runs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..2880d276 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "moonshotai/Kimi-VL-A3B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-21T02:12:07.934555+00:00", + "log_path": "/data/results/v0.19.0/logs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 30.74, + "kv_cache_memory_gib": 41.32, + "cuda_graph_memory_gib": 1.19, + "max_concurrency": 174.12, + "kv_cache_tokens": 1426368, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 89148, + "kv_block_size_bytes": 497678 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..05ec09e3 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "moonshotai/Kimi-VL-A3B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-21T02:13:58.377110+00:00", + "log_path": "/data/results/v0.19.0/logs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp2pp1dp1--8192.log", + "weight_memory_gib": 15.53, + "kv_cache_memory_gib": 54.78, + "cuda_graph_memory_gib": 1.04, + "max_concurrency": 230.82, + "kv_cache_tokens": 1890896, + 
"vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 118181, + "kv_block_size_bytes": 497707 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/openai-gpt-oss-120b--h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/openai-gpt-oss-120b--h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..d4c16582 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/openai-gpt-oss-120b--h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1 @@ +{"skipped":true} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/openai-gpt-oss-120b--h100-80gb--tp8pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/openai-gpt-oss-120b--h100-80gb--tp8pp1dp1--8192.json new file mode 100644 index 00000000..d4c16582 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/openai-gpt-oss-120b--h100-80gb--tp8pp1dp1--8192.json @@ -0,0 +1 @@ +{"skipped":true} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/openai-gpt-oss-20b--h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/openai-gpt-oss-20b--h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..da0029bc --- /dev/null +++ b/accuracy/results/v0.19.0/runs/openai-gpt-oss-20b--h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "openai/gpt-oss-20b", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T21:59:11.204478+00:00", + "log_path": "/data/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp4pp1dp1--8192.log", + "weight_memory_gib": 3.63, + "kv_cache_memory_gib": 66.57, + "cuda_graph_memory_gib": 1.09, + "max_concurrency": 709.4, + "kv_cache_tokens": 5817056, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 363566, + "kv_block_size_bytes": 196605 +} \ No newline at end of file diff 
--git a/accuracy/results/v0.19.0/runs/qwen-qwen-7b-chat--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/qwen-qwen-7b-chat--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..09ef0238 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen-7b-chat--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1 @@ +{"skipped":true,"reason":"Qwen-7B-Chat (v1 arch) incompatible with vLLM v0.19.0 — exits 1 immediately"} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen-7b-chat--h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/qwen-qwen-7b-chat--h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..09ef0238 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen-7b-chat--h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1 @@ +{"skipped":true,"reason":"Qwen-7B-Chat (v1 arch) incompatible with vLLM v0.19.0 — exits 1 immediately"} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen-7b-chat--h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/qwen-qwen-7b-chat--h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..09ef0238 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen-7b-chat--h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1 @@ +{"skipped":true,"reason":"Qwen-7B-Chat (v1 arch) incompatible with vLLM v0.19.0 — exits 1 immediately"} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen2---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.json b/accuracy/results/v0.19.0/runs/qwen-qwen2---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.json new file mode 100644 index 00000000..b50f7569 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen2---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.json @@ -0,0 +1,26 @@ +{ + "model": "Qwen/Qwen2.5-7B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + 
"dtype": "bfloat16", + "quantization": null, + "kv_cache_dtype": "fp8" + }, + "timestamp": "2026-04-21T02:57:22.058570+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen2---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.log", + "weight_memory_gib": 14.25, + "kv_cache_memory_gib": 58.53, + "cuda_graph_memory_gib": 0.61, + "max_concurrency": 267.58, + "kv_cache_tokens": 2192000, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 137000, + "kv_block_size_bytes": 458730, + "_sweep_dim": "kv_cache_dtype" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen2-5--h100-80gb--tp1pp1dp1--8192-dtf16-kvfp8.json b/accuracy/results/v0.19.0/runs/qwen-qwen2-5--h100-80gb--tp1pp1dp1--8192-dtf16-kvfp8.json new file mode 100644 index 00000000..cdbb807f --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen2-5--h100-80gb--tp1pp1dp1--8192-dtf16-kvfp8.json @@ -0,0 +1,4 @@ +{ + "skipped": true, + "reason": "--kv-cache-dtype=fp8 fails with FLASH_ATTN backend (exit 1). Consistent failure in vLLM v0.19.0 with VLLM_ATTENTION_BACKEND=FLASH_ATTN." 
+} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen2-5-72b-instruc--h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-72b-instruc--h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..958baf80 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-72b-instruc--h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "Qwen/Qwen2.5-72B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T21:09:59.841187+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen2-5-72b-instruc--h100-80gb--tp2pp1dp1--8192.log", + "weight_memory_gib": 67.8, + "kv_cache_memory_gib": 3.04, + "cuda_graph_memory_gib": 1.63, + "max_concurrency": 2.43, + "kv_cache_tokens": 19920, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 1245, + "kv_block_size_bytes": 2621827 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen2-5-72b-instruc--h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-72b-instruc--h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..080bebde --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-72b-instruc--h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "Qwen/Qwen2.5-72B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T21:13:21.782090+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen2-5-72b-instruc--h100-80gb--tp4pp1dp1--8192.log", + "weight_memory_gib": 33.98, + 
"kv_cache_memory_gib": 36.74, + "cuda_graph_memory_gib": 1.51, + "max_concurrency": 58.79, + "kv_cache_tokens": 481600, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 30100, + "kv_block_size_bytes": 1310607 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen2-5-72b-instruc--h100-80gb--tp8pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-72b-instruc--h100-80gb--tp8pp1dp1--8192.json new file mode 100644 index 00000000..d4c16582 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-72b-instruc--h100-80gb--tp8pp1dp1--8192.json @@ -0,0 +1 @@ +{"skipped":true} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.json b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.json new file mode 100644 index 00000000..a7e94b0b --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.json @@ -0,0 +1,26 @@ +{ + "model": "Qwen/Qwen2.5-7B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "bfloat16", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-21T02:56:13.220345+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen2-5-7b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.log", + "weight_memory_gib": 14.25, + "kv_cache_memory_gib": 58.53, + "cuda_graph_memory_gib": 0.61, + "max_concurrency": 133.79, + "kv_cache_tokens": 1096000, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 68500, + "kv_block_size_bytes": 917461, + "_sweep_dim": "kv_cache_dtype" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-in--h100-80gb--tp1pp1dp1--8192-dtf16.json 
b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-in--h100-80gb--tp1pp1dp1--8192-dtf16.json new file mode 100644 index 00000000..211ec873 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-in--h100-80gb--tp1pp1dp1--8192-dtf16.json @@ -0,0 +1,26 @@ +{ + "model": "Qwen/Qwen2.5-7B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "float16", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-21T01:03:25.334075+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen2-5-7b-in--h100-80gb--tp1pp1dp1--8192-dtf16.log", + "weight_memory_gib": 14.25, + "kv_cache_memory_gib": 58.53, + "cuda_graph_memory_gib": 0.61, + "max_concurrency": 133.79, + "kv_cache_tokens": 1096000, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 68500, + "kv_block_size_bytes": 917461, + "_sweep_dim": "kv_cache_dtype" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruc--h100-80gb--tp1pp1dp1--16384.json b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruc--h100-80gb--tp1pp1dp1--16384.json new file mode 100644 index 00000000..246a006a --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruc--h100-80gb--tp1pp1dp1--16384.json @@ -0,0 +1,26 @@ +{ + "model": "Qwen/Qwen2.5-7B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 16384, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T23:08:14.790219+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen2-5-7b-instruc--h100-80gb--tp1pp1dp1--16384.log", + "weight_memory_gib": 14.25, + "kv_cache_memory_gib": 58.53, + "cuda_graph_memory_gib": 0.61, + "max_concurrency": 66.89, + 
"kv_cache_tokens": 1095968, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 68498, + "kv_block_size_bytes": 917488, + "_sweep_dim": "max_model_len" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruc--h100-80gb--tp1pp1dp1--32768.json b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruc--h100-80gb--tp1pp1dp1--32768.json new file mode 100644 index 00000000..3deef83c --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruc--h100-80gb--tp1pp1dp1--32768.json @@ -0,0 +1,26 @@ +{ + "model": "Qwen/Qwen2.5-7B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 32768, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T23:23:49.457851+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen2-5-7b-instruc--h100-80gb--tp1pp1dp1--32768.log", + "weight_memory_gib": 14.25, + "kv_cache_memory_gib": 58.53, + "cuda_graph_memory_gib": 0.61, + "max_concurrency": 33.45, + "kv_cache_tokens": 1096000, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 68500, + "kv_block_size_bytes": 917461, + "_sweep_dim": "max_model_len" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--2048.json b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--2048.json new file mode 100644 index 00000000..5bdbf020 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--2048.json @@ -0,0 +1,26 @@ +{ + "model": "Qwen/Qwen2.5-7B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 2048, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" 
+ }, + "timestamp": "2026-04-20T23:06:33.520598+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--2048.log", + "weight_memory_gib": 14.25, + "kv_cache_memory_gib": 58.53, + "cuda_graph_memory_gib": 0.61, + "max_concurrency": 535.16, + "kv_cache_tokens": 1096000, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 68500, + "kv_block_size_bytes": 917461, + "_sweep_dim": "max_model_len" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--4096.json b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--4096.json new file mode 100644 index 00000000..09b7660a --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--4096.json @@ -0,0 +1,26 @@ +{ + "model": "Qwen/Qwen2.5-7B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 4096, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T23:22:49.019986+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--4096.log", + "weight_memory_gib": 14.25, + "kv_cache_memory_gib": 58.53, + "cuda_graph_memory_gib": 0.61, + "max_concurrency": 267.58, + "kv_cache_tokens": 1096000, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 68500, + "kv_block_size_bytes": 917461, + "_sweep_dim": "max_model_len" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..34ed1a9b --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": 
"Qwen/Qwen2.5-7B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T20:59:20.411713+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 14.25, + "kv_cache_memory_gib": 58.53, + "cuda_graph_memory_gib": 0.61, + "max_concurrency": 133.79, + "kv_cache_tokens": 1096000, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 68500, + "kv_block_size_bytes": 917461 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..f3f55750 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "Qwen/Qwen2.5-7B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T21:00:48.360905+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp2pp1dp1--8192.log", + "weight_memory_gib": 7.12, + "kv_cache_memory_gib": 63.84, + "cuda_graph_memory_gib": 0.61, + "max_concurrency": 291.82, + "kv_cache_tokens": 2390592, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 149412, + "kv_block_size_bytes": 458782 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp4pp1dp1--8192.json 
b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..9c6e5c91 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "Qwen/Qwen2.5-7B-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T21:02:09.395454+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp4pp1dp1--8192.log", + "weight_memory_gib": 3.55, + "kv_cache_memory_gib": 67.34, + "cuda_graph_memory_gib": 0.69, + "max_concurrency": 615.63, + "kv_cache_tokens": 5043280, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 315205, + "kv_block_size_bytes": 229392 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen3-30b-a3b--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/qwen-qwen3-30b-a3b--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..5c0fe347 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen3-30b-a3b--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "Qwen/Qwen3-30B-A3B", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T22:55:59.089057+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen3-30b-a3b--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 56.88, + "kv_cache_memory_gib": 15.41, + "cuda_graph_memory_gib": 1.24, + "max_concurrency": 20.54, + "kv_cache_tokens": 168288, + "vllm_version": null, + "vllm_commit": null, + 
"kv_cache_blocks": 10518, + "kv_block_size_bytes": 1573147 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen3-30b-a3b--h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/qwen-qwen3-30b-a3b--h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..50ba7cf0 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen3-30b-a3b--h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1 @@ +{"skipped":true,"reason":"Consistent exit-1 on tp2; likely node-specific NCCL/NVLink topology issue"} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen3-30b-a3b--h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/qwen-qwen3-30b-a3b--h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..bfeb7acd --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen3-30b-a3b--h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "Qwen/Qwen3-30B-A3B", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T22:11:04.980065+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen3-30b-a3b--h100-80gb--tp4pp1dp1--8192.log", + "weight_memory_gib": 14.25, + "kv_cache_memory_gib": 56.17, + "cuda_graph_memory_gib": 1.26, + "max_concurrency": 299.57, + "kv_cache_tokens": 2454064, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 153379, + "kv_block_size_bytes": 393222 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen3-8b--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/qwen-qwen3-8b--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..6ea76dc3 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen3-8b--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": 
"Qwen/Qwen3-8B", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T21:20:15.407571+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen3-8b--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 15.27, + "kv_cache_memory_gib": 57.5, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 51.11, + "kv_cache_tokens": 418688, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 26168, + "kv_block_size_bytes": 2359376 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen3-8b--h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/qwen-qwen3-8b--h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..50ba7cf0 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen3-8b--h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1 @@ +{"skipped":true,"reason":"Consistent exit-1 on tp2; likely node-specific NCCL/NVLink topology issue"} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen3-8b--h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/qwen-qwen3-8b--h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..3050f605 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen3-8b--h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "Qwen/Qwen3-8B", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T21:22:36.073018+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen3-8b--h100-80gb--tp4pp1dp1--8192.log", + "weight_memory_gib": 3.82, + "kv_cache_memory_gib": 67.05, + 
"cuda_graph_memory_gib": 0.86, + "max_concurrency": 238.41, + "kv_cache_tokens": 1953024, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 122064, + "kv_block_size_bytes": 589808 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16-qawq.json b/accuracy/results/v0.19.0/runs/redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16-qawq.json new file mode 100644 index 00000000..7b7b3727 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16-qawq.json @@ -0,0 +1,4 @@ +{ + "skipped": true, + "reason": "--quantization=awq flag conflicts with pre-quantized w4a16 model (exit 1). Models auto-detect quantization; explicit flag fails in vLLM v0.19.0." +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16-qfp8.json b/accuracy/results/v0.19.0/runs/redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16-qfp8.json new file mode 100644 index 00000000..dd351560 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16-qfp8.json @@ -0,0 +1,4 @@ +{ + "skipped": true, + "reason": "--quantization=fp8 flag conflicts with pre-quantized w8a8 model (exit 1). Models auto-detect quantization; explicit flag fails in vLLM v0.19.0." 
+} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/redhatai-llama-3-1--h100-80gb--tp1pp1dp1--8192-dtf16.json b/accuracy/results/v0.19.0/runs/redhatai-llama-3-1--h100-80gb--tp1pp1dp1--8192-dtf16.json new file mode 100644 index 00000000..13be19c1 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/redhatai-llama-3-1--h100-80gb--tp1pp1dp1--8192-dtf16.json @@ -0,0 +1,26 @@ +{ + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "float16", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-21T03:07:10.014434+00:00", + "log_path": "/data/results/v0.19.0/logs/redhatai-llama-3-1--h100-80gb--tp1pp1dp1--8192-dtf16.log", + "weight_memory_gib": 8.49, + "kv_cache_memory_gib": 64.6, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 64.6, + "kv_cache_tokens": 529184, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 33074, + "kv_block_size_bytes": 2097228, + "_sweep_dim": "quantization" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/redhatai-llama-3-3-70b-i--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/redhatai-llama-3-3-70b-i--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..92dcd29d --- /dev/null +++ b/accuracy/results/v0.19.0/runs/redhatai-llama-3-3-70b-i--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "redhatai/Llama-3.3-70B-Instruct-quantized.w8a8", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T20:51:52.587968+00:00", + "log_path": 
"/data/results/v0.19.0/logs/redhatai-llama-3-3-70b-i--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 67.72, + "kv_cache_memory_gib": 5.01, + "cuda_graph_memory_gib": 1.98, + "max_concurrency": 2.0, + "kv_cache_tokens": 16416, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 1026, + "kv_block_size_bytes": 5243125 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/redhatai-llama-3-3-70b-i--h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/redhatai-llama-3-3-70b-i--h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..7f891b8f --- /dev/null +++ b/accuracy/results/v0.19.0/runs/redhatai-llama-3-3-70b-i--h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "redhatai/Llama-3.3-70B-Instruct-quantized.w8a8", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T20:54:29.656798+00:00", + "log_path": "/data/results/v0.19.0/logs/redhatai-llama-3-3-70b-i--h100-80gb--tp2pp1dp1--8192.log", + "weight_memory_gib": 33.88, + "kv_cache_memory_gib": 37.28, + "cuda_graph_memory_gib": 1.92, + "max_concurrency": 29.82, + "kv_cache_tokens": 244304, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 15269, + "kv_block_size_bytes": 2621592 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/redhatai-llama-3-3-70b-i--h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/redhatai-llama-3-3-70b-i--h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..c19edce7 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/redhatai-llama-3-3-70b-i--h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "redhatai/Llama-3.3-70B-Instruct-quantized.w8a8", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 4, + 
"pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-20T20:56:54.307875+00:00", + "log_path": "/data/results/v0.19.0/logs/redhatai-llama-3-3-70b-i--h100-80gb--tp4pp1dp1--8192.log", + "weight_memory_gib": 16.96, + "kv_cache_memory_gib": 54.08, + "cuda_graph_memory_gib": 1.8, + "max_concurrency": 86.53, + "kv_cache_tokens": 708880, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 44305, + "kv_block_size_bytes": 1310641 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/redhatai-meta-llam--h100-80gb--tp1pp1dp1--8192-dtf16.json b/accuracy/results/v0.19.0/runs/redhatai-meta-llam--h100-80gb--tp1pp1dp1--8192-dtf16.json new file mode 100644 index 00000000..6a3fbd33 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/redhatai-meta-llam--h100-80gb--tp1pp1dp1--8192-dtf16.json @@ -0,0 +1,26 @@ +{ + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "float16", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-21T03:00:53.311382+00:00", + "log_path": "/data/results/v0.19.0/logs/redhatai-meta-llam--h100-80gb--tp1pp1dp1--8192-dtf16.log", + "weight_memory_gib": 8.49, + "kv_cache_memory_gib": 64.6, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 64.6, + "kv_cache_tokens": 529200, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 33075, + "kv_block_size_bytes": 2097164, + "_sweep_dim": "quantization" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/w4a16-redhatai-lla--h100-80gb--tp1pp1dp1--8192-dtf16.json b/accuracy/results/v0.19.0/runs/w4a16-redhatai-lla--h100-80gb--tp1pp1dp1--8192-dtf16.json new file 
mode 100644 index 00000000..471146f9 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/w4a16-redhatai-lla--h100-80gb--tp1pp1dp1--8192-dtf16.json @@ -0,0 +1,26 @@ +{ + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "float16", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-21T03:11:19.599063+00:00", + "log_path": "/data/results/v0.19.0/logs/w4a16-redhatai-lla--h100-80gb--tp1pp1dp1--8192-dtf16.log", + "weight_memory_gib": 5.38, + "kv_cache_memory_gib": 67.71, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 67.71, + "kv_cache_tokens": 554704, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 34669, + "kv_block_size_bytes": 2097062, + "_sweep_dim": "quantization" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/w8a8-mistral-small-24b--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/w8a8-mistral-small-24b--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..d5b19a98 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/w8a8-mistral-small-24b--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,26 @@ +{ + "model": "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-21T21:03:49.730325+00:00", + "log_path": "/data/results/v0.19.0/logs/w8a8-mistral-small-24b--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 24.07, + "kv_cache_memory_gib": 48.73, + "cuda_graph_memory_gib": 1.04, + "max_concurrency": 38.98, + "kv_cache_tokens": 319344, + "vllm_version": null, + 
"vllm_commit": null, + "kv_cache_blocks": 19959, + "kv_block_size_bytes": 2621546, + "_sweep_dim": "quantization" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/w8a8-mistral-small-24b--h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/w8a8-mistral-small-24b--h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..f16fa160 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/w8a8-mistral-small-24b--h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1,26 @@ +{ + "model": "RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-21T21:05:50.185588+00:00", + "log_path": "/data/results/v0.19.0/logs/w8a8-mistral-small-24b--h100-80gb--tp2pp1dp1--8192.log", + "weight_memory_gib": 12.11, + "kv_cache_memory_gib": 59.02, + "cuda_graph_memory_gib": 1.6, + "max_concurrency": 94.43, + "kv_cache_tokens": 773584, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 48349, + "kv_block_size_bytes": 1310724, + "_sweep_dim": "quantization" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/w8a8-redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16.json b/accuracy/results/v0.19.0/runs/w8a8-redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16.json new file mode 100644 index 00000000..f72fa949 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/w8a8-redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16.json @@ -0,0 +1,26 @@ +{ + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "float16", + "quantization": null, + "kv_cache_dtype": "auto" + 
}, + "timestamp": "2026-04-21T03:09:59.174446+00:00", + "log_path": "/data/results/v0.19.0/logs/w8a8-redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16.log", + "weight_memory_gib": 8.49, + "kv_cache_memory_gib": 64.6, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 64.6, + "kv_cache_tokens": 529184, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 33074, + "kv_block_size_bytes": 2097228, + "_sweep_dim": "quantization" +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/w8a8-redhatai-qwen2-5-7b--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/w8a8-redhatai-qwen2-5-7b--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..951861a7 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/w8a8-redhatai-qwen2-5-7b--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,26 @@ +{ + "model": "RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-21T21:17:29.114207+00:00", + "log_path": "/data/results/v0.19.0/logs/w8a8-redhatai-qwen2-5-7b--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 8.14, + "kv_cache_memory_gib": 64.64, + "cuda_graph_memory_gib": 0.81, + "max_concurrency": 147.74, + "kv_cache_tokens": 1210272, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 75642, + "kv_block_size_bytes": 917567, + "_sweep_dim": "quantization" +} \ No newline at end of file diff --git a/accuracy/scripts/analyze.py b/accuracy/scripts/analyze.py new file mode 100644 index 00000000..4061718e --- /dev/null +++ b/accuracy/scripts/analyze.py @@ -0,0 +1,276 @@ +""" +Aggregate per-run JSON files → CSV + Markdown report. + +Supports two input formats: + 1. 
Flat vLLM-log format (from parse_log.py / sweep_runner.py): + {"model": "...", "gpu": "H100-80GB", "vllm_args": {...}, + "weight_memory_gib": 14.99, "kv_cache_memory_gib": 58.11, ...} + analyze.py fetches model configs from HuggingFace and calls the + capacity planner to compute predictions automatically. + + 2. Pre-analyzed format (legacy): + {"model": "...", "tp": 1, ..., + "measured": {"weight_memory_gib": ..., "kv_cache_gib": ..., ...}, + "planner_predicted": {...}} + +Usage: + python analyze.py --runs accuracy/results/v0.19.0/runs/ \ + --out accuracy/results/v0.19.0/report.md \ + --csv accuracy/results/v0.19.0/results.csv \ + [--hf-token ] +""" +import argparse +import csv +import json +import re +import sys +from pathlib import Path +from typing import Any + +COMPONENTS = ["weight_memory", "activation_memory", "non_torch_memory", "kv_cache"] +_MKEYS = { + "weight_memory": "weight_memory_gib", + "activation_memory": "activation_memory_gib", + "non_torch_memory": "non_torch_memory_gib", + "kv_cache": "kv_cache_gib", +} + +# Known GPU memory sizes (GiB). _gpu_memory_gib() falls back to regex parsing. 
+_GPU_MEMORY_GIB: dict[str, int] = { + "H100-80GB": 80, "H100-40GB": 40, + "A100-80GB": 80, "A100-40GB": 40, + "L40S": 48, "L4": 24, "A10G": 24, "A10": 24, + "V100-32GB": 32, "V100-16GB": 16, +} + + +def _gpu_memory_gib(gpu_name: str) -> int: + if gpu_name in _GPU_MEMORY_GIB: + return _GPU_MEMORY_GIB[gpu_name] + m = re.search(r"(\d+)\s*GB", gpu_name, re.IGNORECASE) + if m: + return int(m.group(1)) + raise ValueError(f"Cannot determine GPU memory for: {gpu_name!r}") + + +def compute_planner_predictions(run: dict[str, Any], hf_token: str | None = None) -> dict[str, float]: + """Call the capacity planner for the given run's model + vllm_args.""" + from planner.capacity_planner import ( + allocatable_kv_cache_memory, + estimate_vllm_activation_memory, + estimate_vllm_cuda_graph_memory, + estimate_vllm_non_torch_memory, + get_model_config_from_hf, + per_gpu_model_memory_required, + ) + + model_name: str = run["model"] + va: dict = run.get("vllm_args", run) + tp = int(va.get("tensor_parallel_size", run.get("tp", 1))) + pp = int(va.get("pipeline_parallel_size", run.get("pp", 1))) + dp = int(va.get("data_parallel_size", run.get("dp", 1))) + max_model_len = int(va.get("max_model_len", run.get("max_model_len", 8192))) + gpu_util = float(va.get("gpu_memory_utilization", 0.9)) + gpu_memory = _gpu_memory_gib(run["gpu"]) + + model_config = get_model_config_from_hf(model_name, hf_token) + weight = per_gpu_model_memory_required(model_name, model_config, tp, pp, hf_token) + kv = allocatable_kv_cache_memory( + model_name, model_config, gpu_memory, gpu_util, + tp=tp, pp=pp, dp=dp, max_model_len=max_model_len, hf_token=hf_token, + ) + activation = estimate_vllm_activation_memory(model_config, tp=tp) + non_torch = estimate_vllm_non_torch_memory(tp) + cuda_graph = estimate_vllm_cuda_graph_memory() + + # allocatable_kv_cache_memory() returns total KV across all (tp×pp) GPUs. + # vLLM logs "Available KV cache memory" per GPU, so divide to match. 
+ kv_per_gpu = kv / (tp * pp) + + return { + "weight_memory_gib": round(weight, 2), + "activation_memory_gib": round(activation, 2), + "non_torch_memory_gib": round(non_torch, 2), + "kv_cache_gib": round(kv_per_gpu, 2), + "kv_cache_total_gib": round(kv, 2), + "cuda_graph_memory_gib": round(cuda_graph, 2), + } + + +def _normalize_run(run: dict[str, Any], hf_token: str | None = None) -> dict[str, Any]: + """Convert flat vLLM-log format to analyzed format; no-op for pre-analyzed format.""" + if "measured" in run or "planner_predicted" in run: + return run + + va: dict = run.get("vllm_args", {}) + normalized: dict[str, Any] = { + "model": run["model"], + "gpu": run.get("gpu", "unknown"), + "tp": int(va.get("tensor_parallel_size", 1)), + "pp": int(va.get("pipeline_parallel_size", 1)), + "dp": int(va.get("data_parallel_size", 1)), + "max_model_len": int(va.get("max_model_len", 8192)), + "vllm_args": va, + } + if "_sweep_dim" in run: + normalized["_sweep_dim"] = run["_sweep_dim"] + + # activation_memory and non_torch_memory are not directly logged by vLLM + normalized["measured"] = { + "weight_memory_gib": run.get("weight_memory_gib"), + "kv_cache_gib": run.get("kv_cache_memory_gib"), + "activation_memory_gib": None, + "non_torch_memory_gib": None, + } + + try: + normalized["planner_predicted"] = compute_planner_predictions(run, hf_token) + except Exception as exc: + print( + f"Warning: planner prediction failed for {run['model']}: {exc}", + file=sys.stderr, + ) + + return normalized + + +def compute_error_pct(run: dict[str, Any]) -> dict[str, float | None]: + result: dict[str, float | None] = {} + for c in COMPONENTS: + key = _MKEYS[c] + measured = run["measured"].get(key) + predicted = run["planner_predicted"].get(key) + if measured is not None and predicted is not None and measured != 0: + result[c] = (predicted - measured) / measured * 100 + else: + result[c] = None + return result + + +def load_runs(directory: str | Path, hf_token: str | None = None) -> 
list[dict[str, Any]]: + runs = [] + for p in sorted(Path(directory).glob("*.json")): + data = json.loads(p.read_text()) + if data.get("skipped"): + continue + data = _normalize_run(data, hf_token) + if "error_pct" not in data and "planner_predicted" in data: + data["error_pct"] = compute_error_pct(data) + runs.append(data) + return runs + + +def find_outliers(runs: list[dict[str, Any]], threshold_pct: float = 10.0) -> list[dict[str, Any]]: + return [ + r for r in runs + if any( + v is not None and abs(v) > threshold_pct + for v in r.get("error_pct", {}).values() + ) + ] + + +def _fmt(v: float | None) -> str: + if v is None: + return "—" + return f"{'+'if v>0 else ''}{v:.1f}%" + + +def generate_markdown_report(runs: list[dict[str, Any]]) -> str: + lines = ["# Memory Validation Report\n"] + + lines += ["## Per-component error\n", + "| Model | TP | PP | DP | max_len | Weight | Activation | Non-torch | KV cache |", + "|---|---|---|---|---|---|---|---|---|"] + for r in runs: + e = r.get("error_pct", {}) + lines.append( + f"| {r['model']} | {r['tp']} | {r['pp']} | {r['dp']} | {r['max_model_len']} " + f"| {_fmt(e.get('weight_memory'))} | {_fmt(e.get('activation_memory'))} " + f"| {_fmt(e.get('non_torch_memory'))} | {_fmt(e.get('kv_cache'))} |" + ) + lines.append("") + + lines += ["## Per-architecture error\n", + "_Group by architecture class. 
Mean and max absolute error per component._\n"] + + lines.append("## Argument sensitivity\n") + + def _sweep_val(r: dict, dim: str) -> Any: + if dim in r: + return r[dim] + return r.get("vllm_args", {}).get(dim, "?") + + for sweep_dim in ("max_model_len", "tp", "pp", "dp", "dtype", "quantization", "kv_cache_dtype"): + sweep_runs = [r for r in runs if r.get("_sweep_dim") == sweep_dim] + if sweep_runs: + lines += [f"### {sweep_dim} sweep\n", + "| Value | Weight | Activation | Non-torch | KV cache |", + "|---|---|---|---|---|"] + for r in sweep_runs: + e = r.get("error_pct", {}) + lines.append( + f"| {_sweep_val(r, sweep_dim)} " + f"| {_fmt(e.get('weight_memory'))} | {_fmt(e.get('activation_memory'))} " + f"| {_fmt(e.get('non_torch_memory'))} | {_fmt(e.get('kv_cache'))} |" + ) + lines.append("") + + lines.append("## Outliers\n") + outliers = find_outliers(runs) + if outliers: + for r in outliers: + bad = {k: v for k, v in r.get("error_pct", {}).items() if v is not None and abs(v) > 10} + lines.append(f"- **{r['model']}** (TP={r['tp']}): {bad} — root cause required") + else: + lines.append("_No outliers (all components within ±10%)._") + lines.append("") + + lines += ["## Calibration decisions\n", + "_Document constant changes here: old value → new value, evidence._\n"] + return "\n".join(lines) + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--runs", required=True) + ap.add_argument("--out", required=True) + ap.add_argument("--csv", default=None) + ap.add_argument("--hf-token", default=None, + help="HuggingFace API token (needed for gated models)") + args = ap.parse_args() + + runs = load_runs(args.runs, hf_token=args.hf_token) + Path(args.out).parent.mkdir(parents=True, exist_ok=True) + Path(args.out).write_text(generate_markdown_report(runs)) + print(f"Report written to {args.out} ({len(runs)} runs)") + + if args.csv: + with open(args.csv, "w", newline="") as f: + w = csv.DictWriter(f, fieldnames=[ + "model", "gpu", "tp", "pp", "dp", 
"max_model_len", + "dtype", "quantization", "kv_cache_dtype", + "weight_error_pct", "activation_error_pct", + "non_torch_error_pct", "kv_cache_error_pct"]) + w.writeheader() + for r in runs: + e = r.get("error_pct", {}) + va = r.get("vllm_args", r) + w.writerow({ + "model": r["model"], "gpu": r["gpu"], + "tp": va.get("tensor_parallel_size", r.get("tp")), + "pp": va.get("pipeline_parallel_size", r.get("pp")), + "dp": va.get("data_parallel_size", r.get("dp")), + "max_model_len": va.get("max_model_len", r.get("max_model_len")), + "dtype": va.get("dtype", "auto"), + "quantization": va.get("quantization"), + "kv_cache_dtype": va.get("kv_cache_dtype", "auto"), + "weight_error_pct": e.get("weight_memory"), + "activation_error_pct": e.get("activation_memory"), + "non_torch_error_pct": e.get("non_torch_memory"), + "kv_cache_error_pct": e.get("kv_cache"), + }) + + +if __name__ == "__main__": + main() diff --git a/accuracy/scripts/collect.py b/accuracy/scripts/collect.py new file mode 100644 index 00000000..6cc8b419 --- /dev/null +++ b/accuracy/scripts/collect.py @@ -0,0 +1,153 @@ +""" +Pull vLLM memory validation results from the cluster PVC to a local directory. + +Creates a temporary busybox reader pod that mounts the PVC, uses kubectl cp +to copy logs/ and runs/ for the given vLLM version, then deletes the pod. 
+ +Usage: + python accuracy/scripts/collect.py \ + [--vllm-version v0.19.0] # falls back to sweep.yaml vllm_image tag + [--sweep accuracy/scripts/sweep.yaml] + [--namespace llmdplanner] + [--pvc vllm-mem-data] + [--out data/benchmarks/memory/] + [--dry-run] + +Results land in: //logs/ and //runs/ +""" +import argparse +import json +import subprocess +import sys +import time +from pathlib import Path +from typing import Any + +import yaml + +READER_POD_NAME = "vllm-mem-reader" +POLL_INTERVAL_S = 2 +POD_READY_TIMEOUT_S = 60 + + +def _resolve_version(vllm_version_arg: str | None, sweep_path: str | None) -> str: + if vllm_version_arg: + return vllm_version_arg + if sweep_path: + config = yaml.safe_load(Path(sweep_path).read_text()) + image = config.get("defaults", {}).get("vllm_image", "") + if ":" in image: + return image.split(":")[-1] + print( + "Error: cannot resolve vLLM version. Provide --vllm-version or " + "--sweep pointing to a sweep.yaml with defaults.vllm_image set.", + file=sys.stderr, + ) + sys.exit(1) + + +def _reader_pod_manifest(namespace: str, pvc: str) -> dict[str, Any]: + return { + "apiVersion": "v1", + "kind": "Pod", + "metadata": {"name": READER_POD_NAME, "namespace": namespace}, + "spec": { + "restartPolicy": "Never", + "volumes": [{"name": "data", "persistentVolumeClaim": {"claimName": pvc}}], + "containers": [{ + "name": "reader", + "image": "busybox", + "command": ["sh", "-c", "sleep 3600"], + "volumeMounts": [{"name": "data", "mountPath": "/data"}], + }], + }, + } + + +def _kubectl(*args: str, check: bool = True) -> subprocess.CompletedProcess: + return subprocess.run(["kubectl", *args], capture_output=True, text=True, check=check) + + +def _wait_for_pod_running(namespace: str, timeout: int = POD_READY_TIMEOUT_S) -> None: + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + result = _kubectl("get", "pod", READER_POD_NAME, "-n", namespace, + "-o", "jsonpath={.status.phase}", check=False) + if result.stdout.strip() == 
"Running": + return + time.sleep(POLL_INTERVAL_S) + raise TimeoutError( + f"Reader pod did not reach Running within {timeout}s. " + f"Check: kubectl describe pod {READER_POD_NAME} -n {namespace}" + ) + + +def collect( + vllm_version: str, + namespace: str, + pvc: str, + out: str, + dry_run: bool = False, +) -> None: + src_base = f"/data/results/{vllm_version}" + dst_base = Path(out) / vllm_version + + for subdir in ("logs", "runs"): + src = f"{READER_POD_NAME}:{src_base}/{subdir}/" + dst = dst_base / subdir + print(f" {'[dry-run] ' if dry_run else ''}kubectl cp {src} → {dst}") + + if dry_run: + return + + manifest_json = json.dumps(_reader_pod_manifest(namespace, pvc)) + subprocess.run( + ["kubectl", "apply", "-f", "-", "-n", namespace], + input=manifest_json, text=True, check=True, + ) + + try: + print("Waiting for reader pod to be Running...", flush=True) + _wait_for_pod_running(namespace) + + for subdir in ("logs", "runs"): + src = f"{READER_POD_NAME}:{src_base}/{subdir}/" + dst = dst_base / subdir + dst.mkdir(parents=True, exist_ok=True) + print(f"Copying {subdir}...", flush=True) + _kubectl("cp", "-n", namespace, src, str(dst)) + + finally: + print("Deleting reader pod...", flush=True) + _kubectl("delete", "pod", READER_POD_NAME, "-n", namespace, + "--ignore-not-found", check=False) + + runs = list((dst_base / "runs").glob("*.json")) + logs = list((dst_base / "logs").glob("*.log")) + print(f"\nCollected {len(runs)} JSON results and {len(logs)} logs → {dst_base}") + + +def main() -> None: + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--vllm-version", default=None) + ap.add_argument("--sweep", default="accuracy/scripts/sweep.yaml") + ap.add_argument("--namespace", default="llmdplanner") + ap.add_argument("--pvc", default="vllm-mem-data") + ap.add_argument("--out", default="data/benchmarks/memory/") + ap.add_argument("--dry-run", action="store_true") + args = ap.parse_args() + + version = _resolve_version(args.vllm_version, args.sweep) + 
print(f"vLLM version: {version}", flush=True) + + collect( + vllm_version=version, + namespace=args.namespace, + pvc=args.pvc, + out=args.out, + dry_run=args.dry_run, + ) + + +if __name__ == "__main__": + main() diff --git a/accuracy/scripts/deep_analysis.py b/accuracy/scripts/deep_analysis.py new file mode 100644 index 00000000..985d1365 --- /dev/null +++ b/accuracy/scripts/deep_analysis.py @@ -0,0 +1,397 @@ +""" +Deep percent-error analysis of capacity-planner predictions vs vLLM measurements. + +Reads results.csv produced by analyze.py and writes a detailed markdown report +broken down by model, model family, TP degree, and quantization. + +Usage: + python deep_analysis.py \ + --csv accuracy/results/v0.19.0/results.csv \ + --out accuracy/results/v0.19.0/deep_analysis.md +""" +import argparse +import csv +import math +import statistics +from collections import defaultdict +from pathlib import Path + + +# --------------------------------------------------------------------------- +# Data loading +# --------------------------------------------------------------------------- + +def load_csv(path: str) -> list[dict]: + rows = [] + for r in csv.DictReader(open(path)): + for f in ("tp", "pp", "dp", "max_model_len"): + if r[f]: + r[f] = int(r[f]) + for f in ("weight_error_pct", "activation_error_pct", + "non_torch_error_pct", "kv_cache_error_pct"): + r[f] = float(r[f]) if r[f] else None + rows.append(r) + return rows + + +def family(model: str) -> str: + """Coarse family label for grouping.""" + m = model.lower() + if "llama-4" in m: return "Llama-4" + if "llama-3.3" in m: return "Llama-3.3" + if "llama-3.1" in m or "llama-3-1" in m: return "Llama-3.1" + if "llama" in m: return "Llama (other)" + if "qwen3" in m: return "Qwen3" + if "qwen2.5" in m or "qwen2-5" in m: return "Qwen2.5" + if "qwen2" in m: return "Qwen2" + if "qwen" in m: return "Qwen (other)" + if "deepseek" in m: return "DeepSeek" + if "mistral-small" in m: return "Mistral-Small" + if "mixtral" in m: return 
"Mixtral" + if "phi" in m: return "Phi" + if "granite-vision" in m: return "Granite-Vision" + if "granite" in m: return "Granite" + if "kimi-vl" in m: return "Kimi-VL" + if "kimi" in m: return "Kimi" + if "gpt-oss" in m: return "GPT-OSS (openai)" + return model.split("/")[0] + + +def arch_type(model: str) -> str: + m = model.lower() + if any(x in m for x in ["mixtral", "qwen3-30b-a3b", "kimi-dev", "gpt-oss", + "deepseek-v2", "llama-4-scout"]): + return "MoE" + if any(x in m for x in ["granite-vision", "kimi-vl"]): + return "Multimodal" + return "Dense" + + +# --------------------------------------------------------------------------- +# Stats helpers +# --------------------------------------------------------------------------- + +def _stats(values: list[float]) -> dict: + if not values: + return {} + return { + "n": len(values), + "mean": statistics.mean(values), + "median": statistics.median(values), + "stdev": statistics.stdev(values) if len(values) > 1 else 0.0, + "min": min(values), + "max": max(values), + "mae": statistics.mean(abs(v) for v in values), + "within5": sum(1 for v in values if abs(v) <= 5) / len(values) * 100, + "within10": sum(1 for v in values if abs(v) <= 10) / len(values) * 100, + } + + +def fmt_pct(v: float | None, decimals: int = 1) -> str: + if v is None or (isinstance(v, float) and math.isnan(v)): + return "—" + sign = "+" if v > 0 else "" + return f"{sign}{v:.{decimals}f}%" + + +def fmt_stat_row(label: str, s: dict, field: str = "weight") -> str: + if not s: + return f"| {label} | — | — | — | — | — | — |" + return ( + f"| {label} | {s['n']} " + f"| {fmt_pct(s['mean'])} " + f"| {fmt_pct(s['mae'])} " + f"| {fmt_pct(s['min'])} / {fmt_pct(s['max'])} " + f"| {s['within5']:.0f}% " + f"| {s['within10']:.0f}% |" + ) + + +STAT_HEADER = ( + "| Cohort | N | Mean err | MAE | Min / Max | ≤5% | ≤10% |", + "|---|---|---|---|---|---|---|", +) + + +# --------------------------------------------------------------------------- +# Report sections +# 
--------------------------------------------------------------------------- + +def section_executive_summary(rows: list[dict]) -> list[str]: + w_vals = [r["weight_error_pct"] for r in rows if r["weight_error_pct"] is not None] + k_vals = [r["kv_cache_error_pct"] for r in rows if r["kv_cache_error_pct"] is not None] + ws = _stats(w_vals) + ks = _stats(k_vals) + + lines = [ + "## Executive Summary\n", + f"**Runs analyzed**: {len(rows)} across {len({r['model'] for r in rows})} models " + f"on {len({r['gpu'] for r in rows})} GPU type(s).\n", + "### Overall accuracy\n", + *STAT_HEADER, + fmt_stat_row("Weight memory", ws), + fmt_stat_row("KV cache memory", ks), + "", + ] + return lines + + +def section_per_model(rows: list[dict]) -> list[str]: + lines = ["## Per-model breakdown\n"] + by_model = defaultdict(list) + for r in rows: + by_model[r["model"]].append(r) + + for model in sorted(by_model): + mrs = by_model[model] + fam = family(model) + atype = arch_type(model) + lines.append(f"### {model} _{fam} · {atype}_\n") + lines += [ + "| TP | PP | DP | max_len | dtype | quant | kv_dtype " + "| Weight err | KV err |", + "|---|---|---|---|---|---|---|---|---|", + ] + for r in sorted(mrs, key=lambda x: (x["tp"], x["pp"], x["max_model_len"])): + lines.append( + f"| {r['tp']} | {r['pp']} | {r['dp']} | {r['max_model_len']} " + f"| {r['dtype'] or 'auto'} " + f"| {r['quantization'] or '—'} " + f"| {r['kv_cache_dtype'] or 'auto'} " + f"| {fmt_pct(r['weight_error_pct'])} " + f"| {fmt_pct(r['kv_cache_error_pct'])} |" + ) + lines.append("") + return lines + + +def section_per_family(rows: list[dict]) -> list[str]: + lines = ["## Per-model-family accuracy\n", *STAT_HEADER] + by_fam: dict[str, list] = defaultdict(list) + for r in rows: + by_fam[family(r["model"])].append(r) + + for fam in sorted(by_fam): + frows = by_fam[fam] + w_vals = [r["weight_error_pct"] for r in frows if r["weight_error_pct"] is not None] + k_vals = [r["kv_cache_error_pct"] for r in frows if r["kv_cache_error_pct"] 
is not None] + lines.append(fmt_stat_row(f"**{fam}** — weight", _stats(w_vals))) + lines.append(fmt_stat_row(f"**{fam}** — KV", _stats(k_vals))) + lines.append("") + return lines + + +def section_by_arch_type(rows: list[dict]) -> list[str]: + lines = ["## By architecture type\n", *STAT_HEADER] + by_type: dict[str, list] = defaultdict(list) + for r in rows: + by_type[arch_type(r["model"])].append(r) + + for atype in ("Dense", "MoE", "Multimodal"): + trows = by_type.get(atype, []) + if not trows: + continue + w_vals = [r["weight_error_pct"] for r in trows if r["weight_error_pct"] is not None] + k_vals = [r["kv_cache_error_pct"] for r in trows if r["kv_cache_error_pct"] is not None] + lines.append(fmt_stat_row(f"**{atype}** — weight", _stats(w_vals))) + lines.append(fmt_stat_row(f"**{atype}** — KV", _stats(k_vals))) + lines.append("") + return lines + + +def section_tp_sensitivity(rows: list[dict]) -> list[str]: + lines = [ + "## TP sensitivity\n", + "_KV cache error grouped by tensor-parallel degree (all models). 
" + "After applying the per-GPU normalisation (÷TP×PP)._\n", + *STAT_HEADER, + ] + by_tp: dict[int, list] = defaultdict(list) + for r in rows: + if r["kv_cache_error_pct"] is not None: + by_tp[r["tp"]].append(r["kv_cache_error_pct"]) + for tp in sorted(by_tp): + lines.append(fmt_stat_row(f"TP={tp}", _stats(by_tp[tp]))) + lines.append("") + + lines += [ + "## PP sensitivity\n", + "_KV cache error grouped by pipeline-parallel degree._\n", + *STAT_HEADER, + ] + by_pp: dict[int, list] = defaultdict(list) + for r in rows: + if r["kv_cache_error_pct"] is not None: + by_pp[r["pp"]].append(r["kv_cache_error_pct"]) + for pp in sorted(by_pp): + lines.append(fmt_stat_row(f"PP={pp}", _stats(by_pp[pp]))) + lines.append("") + return lines + + +def section_context_len_sensitivity(rows: list[dict]) -> list[str]: + # Only include runs from models that were tested across multiple context lengths + tested = defaultdict(list) + for r in rows: + if r["tp"] == 1 and r["pp"] == 1: + tested[r["model"]].append(r) + multi = {m: rs for m, rs in tested.items() if len({r["max_model_len"] for r in rs}) > 1} + + if not multi: + return [] + + lines = [ + "## Context-length sensitivity (TP=1 runs only)\n", + "_Models tested at multiple max_model_len values. 
" + "KV cache error should be constant if the formula is context-length-agnostic._\n", + ] + for model in sorted(multi): + lines.append(f"**{model}**\n") + lines += ["| max_len | KV err |", "|---|---|"] + for r in sorted(multi[model], key=lambda x: x["max_model_len"]): + lines.append(f"| {r['max_model_len']} | {fmt_pct(r['kv_cache_error_pct'])} |") + lines.append("") + return lines + + +def section_quantization(rows: list[dict]) -> list[str]: + quant_rows = [r for r in rows if r["quantization"]] + if not quant_rows: + return [] + lines = [ + "## Quantization\n", + "| Model | Quant | TP | Weight err | KV err |", + "|---|---|---|---|---|", + ] + for r in sorted(quant_rows, key=lambda x: (x["model"], x["quantization"], x["tp"])): + lines.append( + f"| {r['model']} | {r['quantization']} | {r['tp']} " + f"| {fmt_pct(r['weight_error_pct'])} | {fmt_pct(r['kv_cache_error_pct'])} |" + ) + lines.append("") + return lines + + +def section_outliers(rows: list[dict], threshold: float = 10.0) -> list[str]: + outliers = [ + r for r in rows + if (r["weight_error_pct"] is not None and abs(r["weight_error_pct"]) > threshold) + or (r["kv_cache_error_pct"] is not None and abs(r["kv_cache_error_pct"]) > threshold) + ] + lines = [f"## Outliers (|error| > {threshold:.0f}%)\n"] + if not outliers: + lines.append(f"_No outliers exceeding ±{threshold:.0f}%._\n") + return lines + + lines += [ + "| Model | TP | PP | Weight err | KV err | Likely cause |", + "|---|---|---|---|---|---|", + ] + for r in sorted(outliers, key=lambda x: abs(x["kv_cache_error_pct"] or 0), reverse=True): + we = r["weight_error_pct"] + ke = r["kv_cache_error_pct"] + m = r["model"].lower() + + cause = "unknown" + if ke is not None and ke < -20: + if "70b" in m or "72b" in m: + cause = "large model: activation constant may underestimate real overhead" + elif "30b" in m and "moe" in arch_type(r["model"]).lower() or "a3b" in m: + cause = "MoE: routing overhead not modeled in activation/KV budget" + else: + cause = "overhead 
underestimated; check activation/non-torch constants" + elif ke is not None and ke > 20: + if r["tp"] >= 2 or r["pp"] >= 2: + cause = "TP/PP residual: per-GPU normalisation may be imprecise" + else: + cause = "KV formula overestimates available budget" + if we is not None and abs(we) > 10: + if r["pp"] >= 4: + cause = "PP≥4: weight sharding formula incorrect for high PP" + elif "moe" in arch_type(r["model"]).lower() or "gpt-oss" in m or "llama-4" in m: + cause = "MoE/sparse model: shared expert / embedding memory not sharded by TP" + + lines.append( + f"| {r['model']} | {r['tp']} | {r['pp']} " + f"| {fmt_pct(we)} | {fmt_pct(ke)} | {cause} |" + ) + lines.append("") + return lines + + +def section_calibration_notes(rows: list[dict]) -> list[str]: + w_vals = [r["weight_error_pct"] for r in rows if r["weight_error_pct"] is not None] + k_vals = [r["kv_cache_error_pct"] for r in rows if r["kv_cache_error_pct"] is not None] + + tp1 = [r for r in rows if r["tp"] == 1 and r["pp"] == 1] + k_tp1 = [r["kv_cache_error_pct"] for r in tp1 if r["kv_cache_error_pct"] is not None] + + lines = [ + "## Calibration notes\n", + "### Weight memory\n", + f"- Mean error {fmt_pct(statistics.mean(w_vals) if w_vals else None)} — " + "slightly negative (planner underestimates). " + "Cause: safetensors metadata reports storage dtype; " + "actual in-memory size can differ due to alignment/padding.\n", + "- PP≥4 and certain MoE models show >10% weight error — " + "embedding and shared-expert tensors may not be sharded by TP/PP " + "as assumed by the formula.\n", + "### KV cache memory (TP=1)\n", + f"- TP=1 KV mean error {fmt_pct(statistics.mean(k_tp1) if k_tp1 else None)} " + f"(MAE {fmt_pct(statistics.mean(abs(v) for v in k_tp1) if k_tp1 else None)}). 
def generate_report(rows: list[dict], vllm_version: str = "v0.19.0") -> str:
    """Assemble the full deep-analysis markdown report.

    Args:
        rows: Parsed result rows as returned by ``load_csv``.
        vllm_version: Version string shown in the report subtitle. Defaults
            to the version this campaign was originally run against.

    Returns:
        The complete report as a single newline-joined string.
    """
    # Derive the GPU label from the data so the subtitle cannot disagree with
    # the executive summary; fall back to the campaign default when the rows
    # carry no GPU information.
    gpus = sorted({r["gpu"] for r in rows if r.get("gpu")})
    gpu_label = ", ".join(gpus) if gpus else "H100-80GB"

    parts: list[list[str]] = [
        ["# Capacity Planner — Deep Accuracy Analysis\n",
         f"_vLLM {vllm_version} · {gpu_label} · {len(rows)} runs · "
         f"{len({r['model'] for r in rows})} models_\n"],
        section_executive_summary(rows),
        section_by_arch_type(rows),
        section_per_family(rows),
        section_tp_sensitivity(rows),
        section_context_len_sensitivity(rows),
        section_quantization(rows),
        section_outliers(rows),
        section_calibration_notes(rows),
        section_per_model(rows),
    ]
    return "\n".join(line for section in parts for line in section)
Path(args.out).parent.mkdir(parents=True, exist_ok=True) + Path(args.out).write_text(report) + print(f"Deep analysis written to {args.out} ({len(rows)} rows)") + + +if __name__ == "__main__": + main() diff --git a/accuracy/scripts/parse_log.py b/accuracy/scripts/parse_log.py new file mode 100644 index 00000000..f96cb0ca --- /dev/null +++ b/accuracy/scripts/parse_log.py @@ -0,0 +1,100 @@ +""" +Parse a vLLM v0.19.0 startup log and extract memory quantities. + +Usage: + python parse_log.py # prints JSON to stdout + python parse_log.py --out + +Key lines captured (verified against real vLLM v0.19.0 logs): + Model loading took 14.99 GiB memory and 7.41 seconds + Available KV cache memory: 58.11 GiB + GPU KV cache size: 476,000 tokens + Maximum concurrency for 8,192 tokens per request: 58.11x + Estimated CUDA graph memory: 0.84 GiB total + +Derived fields: + kv_cache_blocks = kv_cache_tokens / 16 (vLLM default block_size=16) + kv_block_size_bytes = kv_cache_memory_gib * 2^30 / kv_cache_blocks +""" +import argparse +import json +import re +from pathlib import Path +from typing import Any + +_FLOAT_PATTERNS: dict[str, str] = { + "weight_memory_gib": r"Model loading took ([\d.]+) GiB memory", + "kv_cache_memory_gib": r"Available KV cache memory:\s*([\d.]+) GiB", + "cuda_graph_memory_gib": r"Estimated CUDA graph memory:\s*([\d.]+) GiB", + "max_concurrency": r"Maximum concurrency for [\d,]+ tokens per request:\s*([\d.]+)x", +} +_INT_PATTERNS: dict[str, str] = { + # Comma-separated integer, e.g. 
"476,000" + "kv_cache_tokens": r"GPU KV cache size:\s*([\d,]+) tokens", +} +_OPT_STR_PATTERNS: dict[str, str] = { + "vllm_version": r"vLLM version:\s*(\S+)", + "vllm_commit": r"\(commit:\s*([0-9a-f]+)\)", +} + +_VLLM_BLOCK_SIZE = 16 # tokens per KV block, constant in vLLM v0.19.0 + + +def parse(log_path: str | Path) -> dict[str, Any]: + text = Path(log_path).read_text() + result: dict[str, Any] = {} + missing: list[str] = [] + + for field, pattern in _FLOAT_PATTERNS.items(): + m = re.search(pattern, text) + if m: + result[field] = float(m.group(1)) + else: + missing.append(field) + + for field, pattern in _INT_PATTERNS.items(): + m = re.search(pattern, text) + if m: + result[field] = int(m.group(1).replace(",", "")) + else: + missing.append(field) + + required = set(_FLOAT_PATTERNS) | set(_INT_PATTERNS) + required -= {"cuda_graph_memory_gib", "max_concurrency"} # optional + actual_missing = [f for f in missing if f in required] + if actual_missing: + raise ValueError( + f"Log {log_path} is missing required fields: {actual_missing}. " + "Confirm vLLM started cleanly (no OOM/offload errors)." 
+ ) + + for field, pattern in _OPT_STR_PATTERNS.items(): + m = re.search(pattern, text) + result[field] = m.group(1) if m else None + + # Derived fields + if "kv_cache_tokens" in result: + result["kv_cache_blocks"] = result["kv_cache_tokens"] // _VLLM_BLOCK_SIZE + if "kv_cache_memory_gib" in result and "kv_cache_blocks" in result and result["kv_cache_blocks"] > 0: + result["kv_block_size_bytes"] = int( + result["kv_cache_memory_gib"] * (2**30) / result["kv_cache_blocks"] + ) + + return result + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("log_path") + ap.add_argument("--out", default=None) + args = ap.parse_args() + result = parse(args.log_path) + output = json.dumps(result, indent=2) + if args.out: + Path(args.out).write_text(output) + else: + print(output) + + +if __name__ == "__main__": + main() diff --git a/accuracy/scripts/sweep.yaml b/accuracy/scripts/sweep.yaml new file mode 100644 index 00000000..1a6b96b7 --- /dev/null +++ b/accuracy/scripts/sweep.yaml @@ -0,0 +1,123 @@ +# Canonical run matrix for the vLLM memory validation campaign. +# Edit defaults.node_selector to match your cluster's GPU node label. +# Run `kubectl get nodes --show-labels` to find the right label. +defaults: + gpu: H100-80GB + gpu_memory_utilization: "0.95" + max_model_len: 8192 + pp: 1 + dp: 1 + dtype: auto + kv_cache_dtype: auto + quantization: null + vllm_image: vllm/vllm-openai:v0.19.0 + namespace: llmdplanner + results_pvc: vllm-mem-data + node_selector: + # CHANGE THIS to match your cluster's GPU node label. 
+ # Example alternatives: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # cloud.google.com/gke-accelerator: nvidia-h100-80gb + nvidia.com/gpu.product: NVIDIA-H100-80GB-HBM3 + +runs: + # ── Kimi models (moonshotai) ────────────────────────────────────────────── + - model: moonshotai/Kimi-VL-A3B-Instruct # 16B total, 3B active MoE, vision-language; num_attention_heads=16 + tp: [1, 2] + trust_remote_code: true + + # Kimi-K2-Instruct (1T) and Kimi-K2.6 (1.1T) removed — OOM on 8x H100 80GB (~640 GB < ~1 TB weights in FP8) + + - model: moonshotai/Kimi-Dev-72B # 72B dense, Qwen2 architecture; num_attention_heads=64 + tp: [2, 4] # tp=8 skipped — no 8-GPU nodes available + trust_remote_code: true + + # ── Argument sensitivity: data parallelism ──────────────────────────────── + - model: meta-llama/Llama-3.1-8B-Instruct + tp: 1 + dp: [1, 2] + _sweep_dim: dp + + # ── Argument sensitivity: --dtype ───────────────────────────────────────── + # Hold kv_cache_dtype fixed to avoid confounding KV cache precision. + # float32 is included to quantify the known planner gap: the planner reads + # the safetensors storage dtype (bf16) and never consults the --dtype flag, + # so fp32 will cause ~2× weight under-prediction and inflated KV estimate. 
+ - model: meta-llama/Llama-3.1-8B-Instruct + tp: 1 + dtype: [float16, bfloat16, float32] + kv_cache_dtype: auto + _sweep_dim: dtype + + # ── Argument sensitivity: --quantization ────────────────────────────────── + - model: meta-llama/Llama-3.1-8B-Instruct # FP16 baseline (no quantization) + tp: 1 + dtype: float16 + kv_cache_dtype: auto + quantization: null + _sweep_dim: quantization + + - model: RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 # INT8 weights and activations (compressed-tensors format, auto-detected) + tp: 1 + dtype: float16 + kv_cache_dtype: auto + quantization: null + _label: w8a8-redhatai-llama-3-1-8b + _sweep_dim: quantization + + - model: RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 # INT4 weights (compressed-tensors format, auto-detected) + tp: 1 + dtype: float16 + kv_cache_dtype: auto + quantization: null + _label: w4a16-redhatai-llama-3-1-8b + _sweep_dim: quantization + + # ── Extended quantization coverage ─────────────────────────────────────── + # Medium model (24B) w8a8: maps out where the activation-constant error becomes + # significant — between 8B (error negligible) and 70B (error dominant at TP=1). + - model: RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 + tp: [1, 2] + _label: w8a8-mistral-small-24b + _sweep_dim: quantization + + # Large model fp8-dynamic: tests a different quantization format from w8a8. + # fp8 weights are 1 byte/param but activations may differ from int8. + # TP=1 skipped — ~65 GiB fp8 weights + overhead leaves <5 GiB KV on a single H100. + - model: RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic + tp: [2, 4] + _label: fp8dyn-llama-3-3-70b + _sweep_dim: quantization + + # Qwen2.5-7B quantized: compare against existing unquantized Qwen2.5-7B baseline. + # Both formats on the same small model isolate quantization effect from size/arch.
+ - model: RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 + tp: 1 + _label: w8a8-redhatai-qwen2-5-7b + _sweep_dim: quantization + + - model: RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic + tp: 1 + _label: fp8dyn-redhatai-qwen2-5-7b + _sweep_dim: quantization + + # ── Argument sensitivity: --kv-cache-dtype ──────────────────────────────── + # FP8 KV cache halves per-token bytes → GPU block count should ~2×. + # FP8 KV cache requires bfloat16 compute dtype in vLLM v0.19.0 + - model: meta-llama/Llama-3.1-8B-Instruct + tp: 1 + dtype: bfloat16 + kv_cache_dtype: [auto, fp8] + _sweep_dim: kv_cache_dtype + + - model: Qwen/Qwen2.5-7B-Instruct + tp: 1 + dtype: bfloat16 + kv_cache_dtype: [auto, fp8] + _sweep_dim: kv_cache_dtype + + # ── Argument sensitivity: non-power-of-2 tp ────────────────────────────── + # Qwen3-14B: num_attention_heads=40, so tp=5 is valid (40 % 5 == 0). + - model: Qwen/Qwen3-14B + tp: 5 + _sweep_dim: tp_odd diff --git a/accuracy/scripts/sweep_runner.py b/accuracy/scripts/sweep_runner.py new file mode 100644 index 00000000..b8ca49b4 --- /dev/null +++ b/accuracy/scripts/sweep_runner.py @@ -0,0 +1,346 @@ +""" +In-cluster orchestrator: reads sweep.yaml, submits sub-jobs sequentially, +fetches startup logs when each pod is ready, and parses results to JSON. + +Designed to run inside a Kubernetes Job (in-cluster config). Requires the +vllm-mem-orchestrator ServiceAccount with Job+Pod+Pod/log RBAC. 
def _expandable_fields() -> list[str]:
    """Return the run fields that may hold a list of values to sweep over."""
    return ["tp", "pp", "dp", "max_model_len", "dtype", "kv_cache_dtype"]


def expand_matrix(config: dict[str, Any]) -> list[dict[str, Any]]:
    """Expand the sweep config into one concrete run dict per combination.

    Each entry under ``runs`` is merged over ``defaults`` (entry keys win).
    Every expandable field whose value is a list is then expanded; when an
    entry has several list-valued fields the result is their cartesian
    product, with fields varying in ``_expandable_fields()`` order (the first
    list field is the slowest-changing one). An entry with an empty list
    produces no runs.

    Args:
        config: Parsed sweep.yaml with a ``runs`` list and optional
            ``defaults`` mapping.

    Returns:
        Independent run dicts in which no expandable field is a list.

    Raises:
        KeyError: If ``config`` has no ``runs`` key.
    """
    import itertools

    defaults = config.get("defaults") or {}
    expanded: list[dict[str, Any]] = []
    for entry in config["runs"]:
        base = {**defaults, **entry}
        list_fields = [f for f in _expandable_fields()
                       if isinstance(base.get(f), list)]
        # With no list fields, product() yields exactly one empty combination,
        # so scalar-only entries produce a single run. The deepcopy keeps
        # nested defaults (e.g. node_selector) from being shared between runs.
        for combo in itertools.product(*(base[f] for f in list_fields)):
            run = copy.deepcopy(base)
            run.update(zip(list_fields, combo))
            expanded.append(run)
    return expanded
+ suffix = "" + dtype = run.get("dtype") or "auto" + if dtype != "auto": + suffix += "-dt" + dtype.replace("float", "f").replace("bfloat", "bf") # dtf16 / dtbf16 + kv = run.get("kv_cache_dtype") or "auto" + if kv != "auto": + suffix += "-kv" + kv.replace("float", "f").replace("bfloat", "bf") # kvf16 / kvfp8 + quant = run.get("quantization") + if quant: + suffix += f"-q{quant}" # qfp8 / qawq + + tail = f"--{gpu_slug}--{params}--{run['max_model_len']}{suffix}" + rid = f"{model_slug}{tail}" + if len(rid) > 52: + model_slug = model_slug[: len(model_slug) - (len(rid) - 52)] + rid = f"{model_slug}{tail}" + return rid.strip("-") + + +# ── K8s orchestration ───────────────────────────────────────────────────────── + +def _load_k8s() -> tuple[Any, Any]: + """Load in-cluster K8s config and return (BatchV1Api, CoreV1Api).""" + from kubernetes import client, config + config.load_incluster_config() + return client.BatchV1Api(), client.CoreV1Api() + + +def _build_job_manifest(run_id: str, run: dict[str, Any]) -> dict[str, Any]: + """Build a Job manifest dict for a single vLLM run.""" + num_gpus = run["tp"] * run["pp"] + node_selector = run.get("node_selector", {}) + return { + "apiVersion": "batch/v1", + "kind": "Job", + "metadata": { + "name": f"vllm-mem-{run_id}", + "namespace": run["namespace"], + "labels": {"app": "vllm-mem-validation", "run-id": run_id}, + }, + "spec": { + "backoffLimit": 0, + "activeDeadlineSeconds": 3600, + "template": { + "metadata": {"labels": {"app": "vllm-mem-validation", "run-id": run_id}}, + "spec": { + "restartPolicy": "Never", + "volumes": [{"name": "data", "persistentVolumeClaim": + {"claimName": run["results_pvc"]}}], + "containers": [{ + "name": "vllm", + "image": run["vllm_image"], + "command": ["vllm", "serve", run["model"]], + "args": [ + f"--tensor-parallel-size={run['tp']}", + f"--pipeline-parallel-size={run['pp']}", + f"--data-parallel-size={run['dp']}", + f"--gpu-memory-utilization={run['gpu_memory_utilization']}", + 
f"--max-model-len={run['max_model_len']}", + "--no-enable-prefix-caching", + # Argument-sensitivity fields — only emitted when non-default + *([f"--dtype={run['dtype']}"] if run.get("dtype") and run["dtype"] != "auto" else []), + *([f"--quantization={run['quantization']}"] if run.get("quantization") else []), + *([f"--kv-cache-dtype={run['kv_cache_dtype']}"] if run.get("kv_cache_dtype") and run["kv_cache_dtype"] != "auto" else []), + *(["--trust-remote-code"] if run.get("trust_remote_code") else []), + ], + "env": [ + {"name": "HF_TOKEN", "valueFrom": + {"secretKeyRef": {"name": "hf-token", "key": "token"}}}, + {"name": "HF_HOME", "value": "/data/models"}, + {"name": "HOME", "value": "/data"}, + {"name": "XDG_CACHE_HOME", "value": "/data/.cache"}, + {"name": "FLASHINFER_WORKSPACE_DIR", "value": "/data/.cache/flashinfer"}, + {"name": "VLLM_ATTENTION_BACKEND", "value": "FLASH_ATTN"}, + ], + "resources": { + "limits": {"nvidia.com/gpu": num_gpus}, + "requests": {"nvidia.com/gpu": num_gpus}, + }, + "volumeMounts": [{"name": "data", "mountPath": "/data"}], + "startupProbe": { + "httpGet": {"path": "/health", "port": 8000}, + "initialDelaySeconds": 60, + "periodSeconds": 10, + "failureThreshold": 180, # 30 minutes + "successThreshold": 1, + }, + }], + "nodeSelector": node_selector, + "tolerations": [{"key": "nvidia.com/gpu", "operator": "Exists", + "effect": "NoSchedule"}], + }, + }, + }, + } + + +def _submit_sub_job(batch_api: Any, run_id: str, run: dict[str, Any]) -> None: + from kubernetes import client + manifest = _build_job_manifest(run_id, run) + batch_api.create_namespaced_job(namespace=run["namespace"], body=manifest) + print(f" Submitted: vllm-mem-{run_id}", flush=True) + + +def _wait_for_pod_ready(core_api: Any, run_id: str, namespace: str, + timeout: int = 2400) -> str: + """Block until the pod's startupProbe passes. 
Returns the pod name.""" + from kubernetes import watch + w = watch.Watch() + label_sel = f"run-id={run_id}" + print(" Waiting for pod ready (startupProbe)...", flush=True) + for event in w.stream(core_api.list_namespaced_pod, + namespace=namespace, + label_selector=label_sel, + timeout_seconds=timeout): + pod = event["object"] + # Detect terminal failure immediately rather than waiting for timeout + if pod.status.phase == "Failed": + w.stop() + raise RuntimeError(f"Pod {pod.metadata.name} failed (phase=Failed)") + if pod.status.container_statuses: + for cs in pod.status.container_statuses: + if cs.ready: + w.stop() + print(f" Pod ready: {pod.metadata.name}", flush=True) + return pod.metadata.name + # Terminated with non-zero exit = OOM or crash + if cs.state and cs.state.terminated and cs.state.terminated.exit_code != 0: + w.stop() + reason = cs.state.terminated.reason or "unknown" + raise RuntimeError( + f"Pod {pod.metadata.name} terminated: {reason} " + f"(exit {cs.state.terminated.exit_code})" + ) + raise TimeoutError(f"Pod for run-id={run_id} did not become ready within {timeout}s") + + +def _fetch_pod_log(core_api: Any, pod_name: str, namespace: str) -> str: + return core_api.read_namespaced_pod_log(name=pod_name, namespace=namespace) + + +def _delete_job(batch_api: Any, run_id: str, namespace: str) -> None: + from kubernetes import client + job_name = f"vllm-mem-{run_id}" + batch_api.delete_namespaced_job( + name=job_name, namespace=namespace, + body=client.V1DeleteOptions(propagation_policy="Foreground"), + ) + print(f" Deleted Job: {job_name}", flush=True) + + +def run_sweep(runs: list[dict[str, Any]], results_dir: Path) -> None: + # In-cluster: parse_log is mounted at /scripts/ via ConfigMap. + # Locally: fall back to the sibling parse_log.py for testing. 
+ try: + import parse_log as pl + except ModuleNotFoundError: + import importlib.util as _ilu + _spec = _ilu.spec_from_file_location("parse_log", Path(__file__).parent / "parse_log.py") + pl = _ilu.module_from_spec(_spec) # type: ignore[assignment] + _spec.loader.exec_module(pl) + + batch_api, core_api = _load_k8s() + logs_dir = results_dir / "logs" + runs_dir = results_dir / "runs" + logs_dir.mkdir(parents=True, exist_ok=True) + runs_dir.mkdir(parents=True, exist_ok=True) + + for i, run in enumerate(runs, 1): + run_id = make_run_id(run) + json_path = runs_dir / f"{run_id}.json" + log_path = logs_dir / f"{run_id}.log" + namespace = run.get("namespace", NAMESPACE) + + print(f"\n[{i}/{len(runs)}] {run_id}", flush=True) + + if json_path.exists(): + try: + existing = json.loads(json_path.read_text()) + if existing.get("skipped"): + print(" Re-running — previous result was a skipped placeholder.", flush=True) + else: + print(" Skipping — result already exists.", flush=True) + continue + except Exception: + print(" Skipping — result already exists.", flush=True) + continue + + pod_name_on_fail: str | None = None + try: + _submit_sub_job(batch_api, run_id, run) + pod_name = _wait_for_pod_ready(core_api, run_id, namespace) + log_text = _fetch_pod_log(core_api, pod_name, namespace) + log_path.write_text(log_text) + print(f" Log saved: {log_path}", flush=True) + except Exception as e: + print(f" Run failed: {e}", flush=True) + # Try to save failure log so the error is visible after pod deletion + try: + pods = core_api.list_namespaced_pod( + namespace=namespace, label_selector=f"run-id={run_id}" + ) + if pods.items: + fail_pod = pods.items[0].metadata.name + fail_log = _fetch_pod_log(core_api, fail_pod, namespace) + fail_path = logs_dir / f"{run_id}.FAILED.log" + fail_path.write_text(fail_log) + print(f" Failure log saved: {fail_path}", flush=True) + except Exception as log_err: + print(f" Could not save failure log: {log_err}", flush=True) + continue + finally: + try: + 
_delete_job(batch_api, run_id, namespace) + except Exception as e: + print(f" Warning: could not delete Job: {e}", flush=True) + + try: + parsed = pl.parse(log_path) + except ValueError as e: + print(f" Parse failed: {e}", flush=True) + continue + + record = { + "model": run["model"], + "gpu": run["gpu"], + "vllm_args": { + "tensor_parallel_size": run["tp"], + "pipeline_parallel_size": run["pp"], + "data_parallel_size": run["dp"], + "max_model_len": run["max_model_len"], + "gpu_memory_utilization": float(run["gpu_memory_utilization"]), + "dtype": run.get("dtype", "auto"), + "quantization": run.get("quantization"), + "kv_cache_dtype": run.get("kv_cache_dtype", "auto"), + }, + "timestamp": datetime.now(timezone.utc).isoformat(), + "log_path": str(log_path), + **parsed, + # planner_predicted and error_pct are added in a separate calibration step + } + if "_sweep_dim" in run: + record["_sweep_dim"] = run["_sweep_dim"] + + json_path.write_text(json.dumps(record, indent=2)) + print(f" JSON saved: {json_path}", flush=True) + + print(f"\nSweep complete. Results in {results_dir}", flush=True) + + +# ── Version extraction ──────────────────────────────────────────────────────── + +def _extract_version(vllm_image: str) -> str: + """Parse the version tag from a vLLM image string. 
+ + Examples: + vllm/vllm-openai:v0.19.0 -> v0.19.0 + myregistry/vllm:latest -> latest + vllm-openai -> unknown + """ + if ":" in vllm_image: + return vllm_image.split(":")[-1] + print(f" Warning: no tag found in vllm_image '{vllm_image}', using 'unknown'", flush=True) + return "unknown" + + +def main() -> None: + ap = argparse.ArgumentParser() + ap.add_argument("--config", default="/sweep/sweep.yaml") + ap.add_argument("--results", default="/data/results/") + ap.add_argument("--dry-run", action="store_true") + args = ap.parse_args() + + config = yaml.safe_load(Path(args.config).read_text()) + version = _extract_version(config.get("defaults", {}).get("vllm_image", "")) + results_dir = Path(args.results) / version + runs = expand_matrix(config) + print(f"vLLM version: {version}", flush=True) + print(f"Results dir: {results_dir}", flush=True) + print(f"Matrix expanded: {len(runs)} runs", flush=True) + for r in runs: + print(f" {make_run_id(r)}", flush=True) + + if args.dry_run: + print("Dry run complete — no Jobs submitted.") + return + + run_sweep(runs, results_dir) + + +if __name__ == "__main__": + main() diff --git a/accuracy/tests/fixtures/llama_tp1_expected.json b/accuracy/tests/fixtures/llama_tp1_expected.json new file mode 100644 index 00000000..628e55a6 --- /dev/null +++ b/accuracy/tests/fixtures/llama_tp1_expected.json @@ -0,0 +1,11 @@ +{ + "weight_memory_gib": 14.99, + "kv_cache_memory_gib": 58.11, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 58.11, + "kv_cache_tokens": 476000, + "vllm_version": "v0.19.0", + "vllm_commit": "abc1234ef", + "kv_cache_blocks": 29750, + "kv_block_size_bytes": 2097315 +} diff --git a/accuracy/tests/test_analyze.py b/accuracy/tests/test_analyze.py new file mode 100644 index 00000000..9db7afa8 --- /dev/null +++ b/accuracy/tests/test_analyze.py @@ -0,0 +1,83 @@ +"""Unit tests for analyze.py — pure data, no GPU or cluster required.""" +import importlib.util +import json +import tempfile +from pathlib import Path + +import 
pytest + + +def _import_analyze(): + spec = importlib.util.spec_from_file_location( + "analyze", + Path(__file__).parents[2] / "accuracy/scripts/analyze.py", + ) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +SAMPLE_RUN = { + "model": "meta-llama/Llama-3.1-8B-Instruct", + "gpu": "H100-80GB", + "tp": 1, "pp": 1, "dp": 1, "max_model_len": 8192, + "measured": { + "weight_memory_gib": 14.23, + "activation_memory_gib": 5.32, + "non_torch_memory_gib": 0.14, + "kv_cache_gib": 58.1, + }, + "planner_predicted": { + "weight_memory_gib": 14.23, + "activation_memory_gib": 5.6, + "non_torch_memory_gib": 0.15, + "kv_cache_gib": 57.6, + }, +} + + +@pytest.mark.unit +def test_compute_error_pct(): + analyze = _import_analyze() + errors = analyze.compute_error_pct(SAMPLE_RUN) + assert abs(errors["weight_memory"] - 0.0) < 0.01 + assert abs(errors["activation_memory"] - 5.26) < 0.1 + assert abs(errors["non_torch_memory"] - 7.14) < 0.1 + assert errors["kv_cache"] < 0 + + +@pytest.mark.unit +def test_load_runs_from_dir(): + analyze = _import_analyze() + with tempfile.TemporaryDirectory() as d: + (Path(d) / "run1.json").write_text(json.dumps(SAMPLE_RUN)) + (Path(d) / "run2.json").write_text(json.dumps({**SAMPLE_RUN, "tp": 2})) + (Path(d) / "metadata.txt").write_text("ignored") + runs = analyze.load_runs(d) + assert len(runs) == 2 + + +@pytest.mark.unit +def test_outlier_detection(): + analyze = _import_analyze() + run_ok = {**SAMPLE_RUN, "error_pct": { + "weight_memory": 0.0, "activation_memory": 5.0, + "non_torch_memory": 7.0, "kv_cache": -0.9}} + run_bad = {**SAMPLE_RUN, "model": "other/model", "error_pct": { + "weight_memory": 0.0, "activation_memory": 15.0, + "non_torch_memory": 7.0, "kv_cache": -0.9}} + outliers = analyze.find_outliers([run_ok, run_bad], threshold_pct=10.0) + assert len(outliers) == 1 + assert outliers[0]["model"] == "other/model" + + +@pytest.mark.unit +def test_markdown_report_contains_required_sections(): + analyze 
= _import_analyze() + run = {**SAMPLE_RUN, "error_pct": { + "weight_memory": 0.0, "activation_memory": 5.26, + "non_torch_memory": 7.14, "kv_cache": -0.86}} + report = analyze.generate_markdown_report([run]) + for section in ["## Per-component error", "## Per-architecture error", + "## Argument sensitivity", "## Outliers", "## Calibration decisions"]: + assert section in report, f"Missing section: {section}" diff --git a/accuracy/tests/test_collect.py b/accuracy/tests/test_collect.py new file mode 100644 index 00000000..e4619a79 --- /dev/null +++ b/accuracy/tests/test_collect.py @@ -0,0 +1,79 @@ +"""Unit tests for collect.py — no cluster or kubectl required.""" +import importlib.util +import json +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +FIXTURES = Path(__file__).parent / "fixtures" + + +def _import_collect(): + spec = importlib.util.spec_from_file_location( + "collect", + Path(__file__).parents[2] / "accuracy/scripts/collect.py", + ) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +@pytest.mark.unit +def test_reader_pod_manifest(): + """Reader pod must mount the correct PVC, use busybox, and be named vllm-mem-reader.""" + collect = _import_collect() + manifest = collect._reader_pod_manifest(namespace="llmdplanner", pvc="vllm-mem-data") + assert manifest["metadata"]["name"] == "vllm-mem-reader" + assert manifest["metadata"]["namespace"] == "llmdplanner" + spec = manifest["spec"] + assert spec["containers"][0]["image"] == "busybox" + volumes = spec["volumes"] + pvc_vol = next(v for v in volumes if "persistentVolumeClaim" in v) + assert pvc_vol["persistentVolumeClaim"]["claimName"] == "vllm-mem-data" + + +@pytest.mark.unit +def test_resolve_version_from_arg(): + """Explicit --vllm-version wins over sweep.yaml.""" + collect = _import_collect() + version = collect._resolve_version( + vllm_version_arg="v0.99.0", + sweep_path=None, + ) + assert version == "v0.99.0" + + 
+@pytest.mark.unit +def test_resolve_version_from_sweep(tmp_path): + """When --vllm-version omitted, version is extracted from sweep.yaml vllm_image.""" + collect = _import_collect() + sweep = tmp_path / "sweep.yaml" + sweep.write_text("defaults:\n vllm_image: vllm/vllm-openai:v0.19.0\n") + version = collect._resolve_version(vllm_version_arg=None, sweep_path=str(sweep)) + assert version == "v0.19.0" + + +@pytest.mark.unit +def test_resolve_version_missing_raises(): + """No --vllm-version and no sweep.yaml must exit with a clear error.""" + collect = _import_collect() + with pytest.raises(SystemExit): + collect._resolve_version(vllm_version_arg=None, sweep_path=None) + + +@pytest.mark.unit +@patch("subprocess.run") +def test_dry_run_no_subprocess(mock_run, capsys, tmp_path): + """--dry-run must print paths and make zero subprocess.run calls.""" + collect = _import_collect() + collect.collect( + vllm_version="v0.19.0", + namespace="llmdplanner", + pvc="vllm-mem-data", + out=str(tmp_path), + dry_run=True, + ) + mock_run.assert_not_called() + out = capsys.readouterr().out + assert "v0.19.0" in out diff --git a/accuracy/tests/test_parse_log.py b/accuracy/tests/test_parse_log.py new file mode 100644 index 00000000..c792fc0f --- /dev/null +++ b/accuracy/tests/test_parse_log.py @@ -0,0 +1,74 @@ +"""Unit tests for parse_log.py — no GPU or cluster required.""" +import json +import os +import tempfile +from pathlib import Path + +import pytest + +FIXTURES = Path(__file__).parent / "fixtures" + + +def _import_parse(): + import importlib.util + spec = importlib.util.spec_from_file_location( + "parse_log", + Path(__file__).parents[2] / "accuracy/scripts/parse_log.py", + ) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +@pytest.mark.unit +def test_parse_all_fields(): + """Fixture log must yield exact expected output.""" + parse_log = _import_parse() + result = parse_log.parse(FIXTURES / "llama_tp1_startup.log") + expected = 
json.loads((FIXTURES / "llama_tp1_expected.json").read_text()) + assert result == expected + + +@pytest.mark.unit +def test_parse_missing_required_field_raises(): + """Any missing required field must raise ValueError naming the field.""" + parse_log = _import_parse() + text = (FIXTURES / "llama_tp1_startup.log").read_text() + # Remove the "GPU KV cache size" line — kv_cache_tokens becomes missing + incomplete = "\n".join(l for l in text.splitlines() if "GPU KV cache size" not in l) + with tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False) as f: + f.write(incomplete) + tmp = f.name + try: + with pytest.raises(ValueError, match="kv_cache_tokens"): + parse_log.parse(tmp) + finally: + os.unlink(tmp) + + +@pytest.mark.unit +def test_numeric_types(): + """Memory values must be float; block counts must be int.""" + parse_log = _import_parse() + result = parse_log.parse(FIXTURES / "llama_tp1_startup.log") + assert isinstance(result["weight_memory_gib"], float) + assert isinstance(result["kv_cache_memory_gib"], float) + assert isinstance(result["kv_cache_tokens"], int) + assert isinstance(result["kv_cache_blocks"], int) + assert isinstance(result["kv_block_size_bytes"], int) + + +@pytest.mark.unit +def test_parse_version_missing_is_nonfatal(): + """Missing version info is a warning, not an error — log may not have it.""" + parse_log = _import_parse() + text = (FIXTURES / "llama_tp1_startup.log").read_text() + no_version = "\n".join(l for l in text.splitlines() if "vLLM version" not in l) + with tempfile.NamedTemporaryFile(mode="w", suffix=".log", delete=False) as f: + f.write(no_version) + tmp = f.name + try: + result = parse_log.parse(tmp) # must not raise + assert "vllm_version" not in result or result["vllm_version"] is None + finally: + os.unlink(tmp) diff --git a/accuracy/tests/test_sweep_runner.py b/accuracy/tests/test_sweep_runner.py new file mode 100644 index 00000000..9d6ff00b --- /dev/null +++ b/accuracy/tests/test_sweep_runner.py @@ -0,0 +1,141 @@ 
+"""Unit tests for sweep_runner matrix expansion and orchestration — no K8s required.""" +import importlib.util +import re +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + + +def _import_runner(): + spec = importlib.util.spec_from_file_location( + "sweep_runner", + Path(__file__).parents[2] / "accuracy/scripts/sweep_runner.py", + ) + mod = importlib.util.module_from_spec(spec) + spec.loader.exec_module(mod) + return mod + + +MINIMAL_CONFIG = { + "defaults": { + "gpu": "H100-80GB", "max_model_len": 8192, + "pp": 1, "dp": 1, "gpu_memory_utilization": "0.90", + "namespace": "llmdplanner", "vllm_image": "vllm/vllm-openai:v0.19.0", + }, + "runs": [ + {"model": "org/model-7b", "tp": [1, 2, 4]}, + {"model": "org/model-70b", "tp": 2, "max_model_len": 16384}, + ], +} + + +@pytest.mark.unit +def test_expand_list_tp_into_three_runs(): + runner = _import_runner() + expanded = runner.expand_matrix(MINIMAL_CONFIG) + tp1_runs = [r for r in expanded if r["model"] == "org/model-7b"] + assert len(tp1_runs) == 3 + assert {r["tp"] for r in tp1_runs} == {1, 2, 4} + + +@pytest.mark.unit +def test_defaults_applied(): + runner = _import_runner() + for run in runner.expand_matrix(MINIMAL_CONFIG): + assert run["pp"] == 1 + assert run["dp"] == 1 + assert run["gpu"] == "H100-80GB" + assert run["namespace"] == "llmdplanner" + + +@pytest.mark.unit +def test_run_level_override_wins(): + runner = _import_runner() + expanded = runner.expand_matrix(MINIMAL_CONFIG) + overridden = [r for r in expanded if r["model"] == "org/model-70b"] + assert len(overridden) == 1 + assert overridden[0]["max_model_len"] == 16384 + + +@pytest.mark.unit +def test_run_ids_unique_and_deterministic(): + runner = _import_runner() + expanded = runner.expand_matrix(MINIMAL_CONFIG) + ids = [runner.make_run_id(r) for r in expanded] + assert len(ids) == len(set(ids)) + assert runner.make_run_id(expanded[0]) == runner.make_run_id(expanded[0]) + + +@pytest.mark.unit +def 
test_run_id_is_valid_k8s_name(): + """Run IDs must be lowercase alphanumeric+hyphens and ≤52 chars.""" + runner = _import_runner() + run = {"model": "meta-llama/Llama-3.1-8B-Instruct", + "gpu": "H100-80GB", "tp": 1, "pp": 1, "dp": 1, "max_model_len": 8192} + rid = runner.make_run_id(run) + assert re.match(r'^[a-z0-9][a-z0-9\-]*[a-z0-9]$', rid), f"Invalid K8s name: {rid}" + assert len(rid) <= 52 + + +@pytest.mark.unit +def test_run_sweep_skips_existing_results(tmp_path): + """run_sweep must skip a run if its JSON already exists.""" + runner = _import_runner() + run = {"model": "org/model-7b", "gpu": "H100-80GB", + "tp": 1, "pp": 1, "dp": 1, "max_model_len": 8192, + "namespace": "llmdplanner", "vllm_image": "v", "results_pvc": "pvc", + "gpu_memory_utilization": "0.90", + "node_selector": {}} + run_id = runner.make_run_id(run) + runs_dir = tmp_path / "runs" + runs_dir.mkdir() + (runs_dir / f"{run_id}.json").write_text('{"status":"done"}') + + mock_parse_log = MagicMock() + with patch.dict(sys.modules, {"parse_log": mock_parse_log}), \ + patch.object(runner, "_load_k8s", return_value=(MagicMock(), MagicMock())), \ + patch.object(runner, "_submit_sub_job") as mock_submit: + runner.run_sweep([run], tmp_path) + mock_submit.assert_not_called() + + +@pytest.mark.unit +def test_extract_version_standard(): + runner = _import_runner() + assert runner._extract_version("vllm/vllm-openai:v0.19.0") == "v0.19.0" + + +@pytest.mark.unit +def test_extract_version_no_tag(): + runner = _import_runner() + assert runner._extract_version("vllm-openai") == "unknown" + + +@pytest.mark.unit +def test_versioned_subfolders(tmp_path): + """run_sweep creates //logs/ and //runs/.""" + runner = _import_runner() + version = "v0.19.0" + results_dir = tmp_path / version + logs_dir = results_dir / "logs" + runs_dir = results_dir / "runs" + logs_dir.mkdir(parents=True) + runs_dir.mkdir(parents=True) + assert logs_dir.is_dir() + assert runs_dir.is_dir() + + +@pytest.mark.unit +def 
test_build_job_manifest_sets_correct_gpu_count(): + runner = _import_runner() + run = {"model": "org/model", "gpu": "H100-80GB", + "tp": 4, "pp": 2, "dp": 1, "max_model_len": 8192, + "namespace": "llmdplanner", "vllm_image": "vllm/vllm-openai:v0.19.0", + "results_pvc": "vllm-mem-data", "gpu_memory_utilization": "0.90", + "node_selector": {"nvidia.com/gpu.product": "NVIDIA-H100-80GB-HBM3"}} + manifest = runner._build_job_manifest(runner.make_run_id(run), run) + container = manifest["spec"]["template"]["spec"]["containers"][0] + # TP=4 × PP=2 = 8 GPUs + assert container["resources"]["limits"]["nvidia.com/gpu"] == 8 From 6417620b546c621155b3f2814b994f7abd6d18e8 Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Wed, 22 Apr 2026 11:10:27 -0400 Subject: [PATCH 02/24] Initial results Signed-off-by: Jing Chen --- accuracy/README.md | 160 + accuracy/results/v0.19.0/accuracy_report.md | 289 ++ accuracy/results/v0.19.0/deep_analysis.md | 333 -- ...epseek-v2---h100-80gb--tp1pp1dp1--8192.log | 787 +++++ ...anite-3-3---h100-80gb--tp1pp1dp1--8192.log | 746 +++++ ...anite-visi--h100-80gb--tp1pp1dp1--8192.log | 750 +++++ ...100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.log | 751 +++++ ...ma---h100-80gb--tp1pp1dp1--8192-dtbf16.log | 749 +++++ ...ma-3--h100-80gb--tp1pp1dp1--8192-dtf16.log | 746 +++++ ...ma-3-1-8b---h100-80gb--tp1pp1dp1--2048.log | 746 +++++ ...ma-3-1-8b---h100-80gb--tp1pp1dp1--4096.log | 747 +++++ ...ma-3-1-8b---h100-80gb--tp1pp1dp1--8192.log | 746 +++++ ...8b---h100-80gb--tp1pp1dp2--8192.FAILED.log | 370 +++ ...ma-3-1-8b---h100-80gb--tp1pp2dp1--8192.log | 1398 ++++++++ ...ma-3-1-8b---h100-80gb--tp1pp4dp1--8192.log | 2627 +++++++++++++++ ...ma-3-1-8b---h100-80gb--tp2pp1dp1--8192.log | 1438 +++++++++ ...ma-3-1-8b---h100-80gb--tp4pp1dp1--8192.log | 2768 ++++++++++++++++ ...ma-3-1-8b--h100-80gb--tp1pp1dp1--32768.log | 747 +++++ ...soft-phi-4--h100-80gb--tp1pp1dp1--8192.log | 752 +++++ ...ral-small---h100-80gb--tp1pp1dp1--8192.log | 919 ++++++ 
...ral-8x7b-i--h100-80gb--tp2pp1dp1--8192.log | 1473 +++++++++ ...mi-dev-72b--h100-80gb--tp2pp1dp1--8192.log | 1565 +++++++++ ...72b--h100-80gb--tp4pp1dp1--8192.FAILED.log | 399 +++ ...mi-dev-72b--h100-80gb--tp4pp1dp1--8192.log | 2854 +++++++++++++++++ ...i-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.log | 1126 +++++++ ...i-vl-a3b-i--h100-80gb--tp2pp1dp1--8192.log | 2188 +++++++++++++ ...100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.log | 750 +++++ ...2b-instruc--h100-80gb--tp2pp1dp1--8192.log | 1523 +++++++++ ...b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.log | 746 +++++ ...b-instruc--h100-80gb--tp1pp1dp1--16384.log | 747 +++++ ...b-instruc--h100-80gb--tp1pp1dp1--32768.log | 747 +++++ ...b-instruct--h100-80gb--tp1pp1dp1--2048.log | 746 +++++ ...b-instruct--h100-80gb--tp1pp1dp1--4096.log | 746 +++++ ...b-instruct--h100-80gb--tp1pp1dp1--8192.log | 749 +++++ ...b-instruct--h100-80gb--tp2pp1dp1--8192.log | 1437 +++++++++ ...b-instruct--h100-80gb--tp4pp1dp1--8192.log | 2766 ++++++++++++++++ ...14b--h100-80gb--tp5pp1dp1--8192.FAILED.log | 675 ++++ ...n3-30b-a3b--h100-80gb--tp1pp1dp1--8192.log | 774 +++++ ...n-qwen3-8b--h100-80gb--tp1pp1dp1--8192.log | 751 +++++ ...-3-3-70b-i--h100-80gb--tp2pp1dp1--8192.log | 2144 +++++++++++++ ...-lla--h100-80gb--tp1pp1dp1--8192-dtf16.log | 745 +++++ ...llam--h100-80gb--tp1pp1dp1--8192-dtf16.log | 878 +++++ .../results/v0.19.0/parameter_sensitivity.md | 217 -- accuracy/results/v0.19.0/report.md | 234 +- accuracy/results/v0.19.0/results.csv | 65 - .../results/v0.19.0/results_predicted.csv | 48 + accuracy/results/v0.19.0/results_raw.csv | 59 + ...pseek-v2---h100-80gb--tp1pp1dp1--8192.json | 2 +- ...pseek-v2---h100-80gb--tp2pp1dp1--8192.json | 25 - ...pseek-v2---h100-80gb--tp4pp1dp1--8192.json | 25 - ...a-3-3-70b--h100-80gb--tp2pp1dp1--8192.json | 2 +- ...a-3-3-70b--h100-80gb--tp4pp1dp1--8192.json | 2 +- ...-qwen2-5---h100-80gb--tp1pp1dp1--8192.json | 26 + ...e-3-1-2b--h100-80gb--tp1pp1dp1--8192.json} | 4 +- ...te-3-1-8b--h100-80gb--tp1pp1dp1--8192.json | 25 
+ ...nite-3-1---h100-80gb--tp2pp1dp1--8192.json | 25 - ...nite-3-1---h100-80gb--tp4pp1dp1--8192.json | 25 - ...nite-3-3---h100-80gb--tp1pp1dp1--8192.json | 2 +- ...nite-3-3---h100-80gb--tp2pp1dp1--8192.json | 25 - ...nite-3-3---h100-80gb--tp4pp1dp1--8192.json | 25 - ...nite-visi--h100-80gb--tp1pp1dp1--8192.json | 2 +- ...nite-visi--h100-80gb--tp2pp1dp1--8192.json | 25 - ...00-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.json | 2 +- ...100-80gb--tp1pp1dp1--8192-dtf16-kvfp8.json | 4 - ...a---h100-80gb--tp1pp1dp1--8192-dtbf16.json | 2 +- ...a-3---h100-80gb--tp1pp1dp1--8192-qfp8.json | 26 + ...a-3--h100-80gb--tp1pp1dp1--8192-dtf16.json | 4 +- ...a-3--h100-80gb--tp1pp1dp1--8192-dtf32.json | 2 +- ...a-3-1-8b---h100-80gb--tp1pp1dp1--2048.json | 2 +- ...a-3-1-8b---h100-80gb--tp1pp1dp1--4096.json | 2 +- ...a-3-1-8b---h100-80gb--tp1pp1dp1--8192.json | 9 +- ...a-3-1-8b---h100-80gb--tp1pp2dp1--8192.json | 2 +- ...a-3-1-8b---h100-80gb--tp1pp4dp1--8192.json | 2 +- ...a-3-1-8b---h100-80gb--tp2pp1dp1--8192.json | 5 +- ...a-3-1-8b---h100-80gb--tp3pp1dp1--8192.json | 4 - ...a-3-1-8b---h100-80gb--tp4pp1dp1--8192.json | 5 +- ...a-3-1-8b--h100-80gb--tp1pp1dp1--16384.json | 27 +- ...a-3-1-8b--h100-80gb--tp1pp1dp1--32768.json | 2 +- ...a-4-scout--h100-80gb--tp1pp1dp1--8192.json | 1 - ...a-4-scout--h100-80gb--tp2pp1dp1--8192.json | 1 - ...a-4-scout--h100-80gb--tp4pp1dp1--8192.json | 25 - ...oft-phi-4--h100-80gb--tp1pp1dp1--8192.json | 2 +- ...oft-phi-4--h100-80gb--tp2pp1dp1--8192.json | 25 - ...oft-phi-4--h100-80gb--tp4pp1dp1--8192.json | 1 - ...al-small---h100-80gb--tp1pp1dp1--8192.json | 2 +- ...al-small---h100-80gb--tp2pp1dp1--8192.json | 25 - ...al-small---h100-80gb--tp4pp1dp1--8192.json | 25 - ...al-8x7b-i--h100-80gb--tp1pp1dp1--8192.json | 1 - ...al-8x7b-i--h100-80gb--tp2pp1dp1--8192.json | 2 +- ...al-8x7b-i--h100-80gb--tp4pp1dp1--8192.json | 25 - ...i-dev-72b--h100-80gb--tp2pp1dp1--8192.json | 2 +- ...i-dev-72b--h100-80gb--tp4pp1dp1--8192.json | 5 +- 
...-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.json | 2 +- ...-vl-a3b-i--h100-80gb--tp2pp1dp1--8192.json | 2 +- ...-oss-120b--h100-80gb--tp4pp1dp1--8192.json | 1 - ...-oss-120b--h100-80gb--tp8pp1dp1--8192.json | 1 - ...t-oss-20b--h100-80gb--tp4pp1dp1--8192.json | 25 - ...n-7b-chat--h100-80gb--tp1pp1dp1--8192.json | 1 - ...n-7b-chat--h100-80gb--tp2pp1dp1--8192.json | 1 - ...n-7b-chat--h100-80gb--tp4pp1dp1--8192.json | 1 - ...00-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.json | 2 +- ...100-80gb--tp1pp1dp1--8192-dtf16-kvfp8.json | 4 - ...b-instruc--h100-80gb--tp2pp1dp1--8192.json | 2 +- ...b-instruc--h100-80gb--tp4pp1dp1--8192.json | 25 - ...b-instruc--h100-80gb--tp8pp1dp1--8192.json | 1 - ...-i--h100-80gb--tp1pp1dp1--8192-dtbf16.json | 2 +- ...-in--h100-80gb--tp1pp1dp1--8192-dtf16.json | 26 - ...-instruc--h100-80gb--tp1pp1dp1--16384.json | 2 +- ...-instruc--h100-80gb--tp1pp1dp1--32768.json | 2 +- ...-instruct--h100-80gb--tp1pp1dp1--2048.json | 2 +- ...-instruct--h100-80gb--tp1pp1dp1--4096.json | 2 +- ...-instruct--h100-80gb--tp1pp1dp1--8192.json | 2 +- ...-instruct--h100-80gb--tp2pp1dp1--8192.json | 5 +- ...-instruct--h100-80gb--tp4pp1dp1--8192.json | 5 +- ...3-30b-a3b--h100-80gb--tp1pp1dp1--8192.json | 2 +- ...3-30b-a3b--h100-80gb--tp2pp1dp1--8192.json | 1 - ...3-30b-a3b--h100-80gb--tp4pp1dp1--8192.json | 25 - ...-qwen3-8b--h100-80gb--tp1pp1dp1--8192.json | 2 +- ...-qwen3-8b--h100-80gb--tp2pp1dp1--8192.json | 1 - ...-qwen3-8b--h100-80gb--tp4pp1dp1--8192.json | 25 - ...h100-80gb--tp1pp1dp1--8192-dtf16-qawq.json | 4 - ...h100-80gb--tp1pp1dp1--8192-dtf16-qfp8.json | 4 - ...3-1--h100-80gb--tp1pp1dp1--8192-dtf16.json | 26 - ...3-3-70b-i--h100-80gb--tp1pp1dp1--8192.json | 25 - ...3-3-70b-i--h100-80gb--tp2pp1dp1--8192.json | 5 +- ...3-3-70b-i--h100-80gb--tp4pp1dp1--8192.json | 25 - ...lam--h100-80gb--tp1pp1dp1--8192-dtf16.json | 26 - ...lla--h100-80gb--tp1pp1dp1--8192-dtf16.json | 2 +- ...small-24b--h100-80gb--tp1pp1dp1--8192.json | 2 +- ...small-24b--h100-80gb--tp2pp1dp1--8192.json | 2 
+- ...lam--h100-80gb--tp1pp1dp1--8192-dtf16.json | 8 +- ...wen2-5-7b--h100-80gb--tp1pp1dp1--8192.json | 2 +- accuracy/scripts/analyze.py | 779 +++-- accuracy/scripts/deep_analysis.py | 397 --- accuracy/scripts/parse_log.py | 20 + accuracy/scripts/parse_logs.py | 206 ++ accuracy/scripts/predict_capacity.py | 185 ++ accuracy/scripts/sweep.yaml | 259 +- accuracy/scripts/sweep_runner.py | 132 +- 139 files changed, 46380 insertions(+), 2104 deletions(-) create mode 100644 accuracy/README.md create mode 100644 accuracy/results/v0.19.0/accuracy_report.md delete mode 100644 accuracy/results/v0.19.0/deep_analysis.md create mode 100644 accuracy/results/v0.19.0/logs/deepseek-ai-deepseek-v2---h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/ibm-granite-granite-3-3---h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/ibm-granite-granite-visi--h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/meta-llama---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.log create mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama---h100-80gb--tp1pp1dp1--8192-dtbf16.log create mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3--h100-80gb--tp1pp1dp1--8192-dtf16.log create mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--2048.log create mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--4096.log create mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp2--8192.FAILED.log create mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp2dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp4dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp2pp1dp1--8192.log 
create mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp4pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--32768.log create mode 100644 accuracy/results/v0.19.0/logs/microsoft-phi-4--h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/mistralai-mistral-small---h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/mistralai-mixtral-8x7b-i--h100-80gb--tp2pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp2pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.FAILED.log create mode 100644 accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp2pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.log create mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2-5-72b-instruc--h100-80gb--tp2pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.log create mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruc--h100-80gb--tp1pp1dp1--16384.log create mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruc--h100-80gb--tp1pp1dp1--32768.log create mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--2048.log create mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--4096.log create mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--8192.log create mode 100644 
accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp2pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp4pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen3-14b--h100-80gb--tp5pp1dp1--8192.FAILED.log create mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen3-30b-a3b--h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen3-8b--h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/redhatai-llama-3-3-70b-i--h100-80gb--tp2pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/w4a16-redhatai-lla--h100-80gb--tp1pp1dp1--8192-dtf16.log create mode 100644 accuracy/results/v0.19.0/logs/w8a8-redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16.log delete mode 100644 accuracy/results/v0.19.0/parameter_sensitivity.md delete mode 100644 accuracy/results/v0.19.0/results.csv create mode 100644 accuracy/results/v0.19.0/results_predicted.csv create mode 100644 accuracy/results/v0.19.0/results_raw.csv delete mode 100644 accuracy/results/v0.19.0/runs/deepseek-ai-deepseek-v2---h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/deepseek-ai-deepseek-v2---h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/fp8dyn-redhatai-qwen2-5---h100-80gb--tp1pp1dp1--8192.json rename accuracy/results/v0.19.0/runs/{ibm-granite-granite-3-1---h100-80gb--tp1pp1dp1--8192.json => granite-3-1-2b--h100-80gb--tp1pp1dp1--8192.json} (79%) create mode 100644 accuracy/results/v0.19.0/runs/granite-3-1-8b--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/ibm-granite-granite-3-1---h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/ibm-granite-granite-3-1---h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/ibm-granite-granite-3-3---h100-80gb--tp2pp1dp1--8192.json delete mode 100644 
accuracy/results/v0.19.0/runs/ibm-granite-granite-3-3---h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/ibm-granite-granite-visi--h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-l--h100-80gb--tp1pp1dp1--8192-dtf16-kvfp8.json create mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3---h100-80gb--tp1pp1dp1--8192-qfp8.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp3pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/microsoft-phi-4--h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/microsoft-phi-4--h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/mistralai-mistral-small---h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/mistralai-mistral-small---h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/mistralai-mixtral-8x7b-i--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/mistralai-mixtral-8x7b-i--h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/openai-gpt-oss-120b--h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/openai-gpt-oss-120b--h100-80gb--tp8pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/openai-gpt-oss-20b--h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen-7b-chat--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen-7b-chat--h100-80gb--tp2pp1dp1--8192.json delete mode 100644 
accuracy/results/v0.19.0/runs/qwen-qwen-7b-chat--h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5--h100-80gb--tp1pp1dp1--8192-dtf16-kvfp8.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-72b-instruc--h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-72b-instruc--h100-80gb--tp8pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-in--h100-80gb--tp1pp1dp1--8192-dtf16.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen3-30b-a3b--h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen3-30b-a3b--h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen3-8b--h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen3-8b--h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16-qawq.json delete mode 100644 accuracy/results/v0.19.0/runs/redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16-qfp8.json delete mode 100644 accuracy/results/v0.19.0/runs/redhatai-llama-3-1--h100-80gb--tp1pp1dp1--8192-dtf16.json delete mode 100644 accuracy/results/v0.19.0/runs/redhatai-llama-3-3-70b-i--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/redhatai-llama-3-3-70b-i--h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/redhatai-meta-llam--h100-80gb--tp1pp1dp1--8192-dtf16.json delete mode 100644 accuracy/scripts/deep_analysis.py create mode 100644 accuracy/scripts/parse_logs.py create mode 100644 accuracy/scripts/predict_capacity.py diff --git a/accuracy/README.md b/accuracy/README.md new file mode 100644 index 00000000..eba6f212 --- /dev/null +++ b/accuracy/README.md @@ -0,0 +1,160 @@ +# Reproducing the vLLM Memory Validation Campaign + +This guide is self-contained. No knowledge of this codebase is assumed. 
+ +## What this does + +A single Kubernetes Job (the **orchestrator**) runs inside your cluster and drives the full +sweep. For each entry in `sweep.yaml`, it creates a sub-job that starts `vllm serve`, waits +for the startup probe to pass (vLLM is ready), fetches the full startup log, deletes the job, +and saves a JSON result. Sub-jobs run sequentially so only one GPU workload runs at a time. + +## Prerequisites + +| Tool | Min version | Install | +|---|---|---| +| kubectl | 1.28 | https://kubernetes.io/docs/tasks/tools/ | +| Kubernetes cluster | 1.28 | with NVIDIA GPU Operator | +| HuggingFace account | — | https://huggingface.co (gated models need approval) | + +GPU requirement: at least 1× H100 80GB (or A100 80GB). 70B model runs need TP=2+ (2+ GPUs). + +## Step 1: Clone + +```bash +git clone https://github.com/llm-d-incubation/llm-d-planner +cd llm-d-planner +``` + +## Step 2: Find your cluster's GPU node label + +```bash +kubectl get nodes --show-labels | grep -i gpu +``` + +Open `accuracy/scripts/sweep.yaml` and update `defaults.node_selector` to match. +The comment in the file explains what to change. 
+ +## Step 3: Apply infrastructure (one-time) + +```bash +kubectl apply -f accuracy/k8s/namespace.yaml +kubectl apply -f accuracy/k8s/rbac.yaml +kubectl apply -f accuracy/k8s/pvc.yaml + +# Create HuggingFace token Secret (replace with your token): +kubectl create secret generic hf-token \ + --from-literal=token=hf_YOUR_TOKEN_HERE \ + --namespace llmdplanner +``` + +## Step 4: Sync ConfigMaps from scripts + +Run these from the repo root to push the latest Python scripts and sweep config into the cluster: + +```bash +kubectl create configmap vllm-mem-scripts \ + --from-file=sweep_runner.py=accuracy/scripts/sweep_runner.py \ + --from-file=parse_log.py=accuracy/scripts/parse_log.py \ + --namespace llmdplanner --dry-run=client -o yaml | kubectl apply -f - + +kubectl create configmap vllm-mem-sweep \ + --from-file=sweep.yaml=accuracy/scripts/sweep.yaml \ + --namespace llmdplanner --dry-run=client -o yaml | kubectl apply -f - +``` + +Re-run this step any time you edit `sweep.yaml` or the Python scripts. + +## Step 5: Smoke test with one run + +Edit `sweep.yaml` to contain only one run entry (e.g., `Llama-3.1-8B-Instruct`, TP=1), +sync the ConfigMap (Step 4), then: + +```bash +kubectl apply -f accuracy/k8s/orchestrator-job.yaml +kubectl logs -f job/vllm-mem-orchestrator -n llmdplanner +``` + +Expected: the orchestrator log shows `Submitted`, `Waiting for pod ready`, `Pod ready`, +`Log saved`, `JSON saved`, `Sweep complete`. Takes ~5-10 minutes. 
+ +Fetch the result: +```bash +# Exec into a pod that mounts the PVC, or copy via a reader pod: +kubectl run reader --image=busybox --restart=Never --rm -it \ + --overrides='{"spec":{"volumes":[{"name":"data","persistentVolumeClaim":{"claimName":"vllm-mem-data"}}],"containers":[{"name":"reader","image":"busybox","volumeMounts":[{"name":"data","mountPath":"/data"}]}]}}' \ + -n llmdplanner -- ls /data/results/ +``` + +## Step 6: Verify log patterns + +The patterns in `accuracy/scripts/parse_log.py` are validated against real vLLM v0.19.0 +logs. After the smoke test, confirm the JSON result has non-null `weight_memory_gib` and +`kv_cache_memory_gib`. If a future vLLM version changes log format, update the regex +patterns, re-run `uv run pytest accuracy/tests/ -q`, and re-sync ConfigMaps before proceeding. + +## Step 7: Run the full sweep + +Restore `sweep.yaml` to the full matrix, sync ConfigMaps, delete the previous orchestrator +Job, and resubmit: + +```bash +kubectl delete job vllm-mem-orchestrator -n llmdplanner --ignore-not-found +kubectl apply -f accuracy/k8s/orchestrator-job.yaml +kubectl logs -f job/vllm-mem-orchestrator -n llmdplanner +``` + +Estimated time: 8–16 hours depending on model download speed and GPU availability. +The orchestrator is restartable: if it fails mid-sweep, already-saved JSON files are skipped +on resubmit. + +## Step 8: Collect results and generate the report + +Pull logs and JSON results from the cluster PVC to your local machine: + +```bash +python accuracy/scripts/collect.py +# Results land in: data/benchmarks/memory/v0.19.0/runs/ and .../logs/ +``` + +Copy the new run JSONs into `accuracy/results/v0.19.0/runs/`, then generate the report. +`analyze.py` calls the capacity planner directly to compute predictions — no separate +calibration step needed. 
For gated models pass `--hf-token <HF_TOKEN>` (only fetches +`config.json`, not model weights):
successful runs across 22 unique models +**Hardware**: H100-80GB (catalog memory = 80 GiB, actual = ~79.19 GiB) +**Planner GPU util**: actual `gpu_memory_utilization` per run (0.95) + +## Executive Summary + +| Metric | Mean error | Mean abs error | Notes | +|--------|:----------:|:--------------:|-------| +| **KV Cache memory** (all 47 runs) | +0.83% | +7.91% | | +| **KV Cache memory** (baseline: tp=pp=1, len=8192, no-quant) | -4.11% | — | n=16 | +| **Weight memory** | -0.03% | +3.27% | From safetensors metadata | +| **Activation memory** | +196.08% | +196.08% | Largest error source | +| **Non-torch overhead** | -43.67% | +53.40% | | +| **Max concurrency** | -1.57% | +15.90% | Proxy for KV cache accuracy | + +### Key Findings + +1. **Weights are accurate** — mean abs error +3.27%, computed directly from safetensors parameter counts. Errors arise only when `--dtype` overrides the native dtype (e.g., `--dtype float32`) or when quantization is not fully captured in the config. +2. **Activation is the dominant error source** — mean +196.08% (over-estimate). The planner uses empirical constants (4.8–8.0 GiB) measured at `max_model_len=16000`; vLLM v0.19.0 reports 0.75–2.2 GiB across all architectures tested. Granite is worst (+600%), Mistral3/Pixtral is best (+15–23%). +3. **Over-estimated activation partially cancels** the catalog GPU memory inflation (+0.77 GiB), leaving KV cache only +0.83% off on average across all runs. But this is coincidental cancellation of two large opposing errors, not model accuracy. +4. **Non-default KV dtype (`--kv-cache-dtype fp8`) doubles token capacity** but the planner ignores this flag — KV token count is off by ~2× for those runs. +5. **`--dtype float32` breaks weight prediction** — the planner uses the HuggingFace config dtype (BF16) and never sees the vLLM `--dtype` override, giving −50% weight error. +6. 
**Pipeline parallelism reduces actual activation** (each GPU processes fewer layers) but the formula uses the same constant regardless of PP, compounding the activation error. + +## Component-Level Error Breakdown + +> Percent error = (predicted − actual) / actual × 100. Positive = over-estimate, negative = under-estimate. + + +### All 47 Runs (n=47) + +| Component | Mean error | Median | Mean abs | Min | Max | n | +|-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| +| Weight | -0.03% | -0.31% | +3.27% | -50.11% | +76.18% | 47 | +| Activation | +196.08% | +153.97% | +196.08% | +14.68% | +633.33% | 47 | +| Non Torch | -43.67% | -40.00% | +53.40% | -72.85% | +114.29% | 47 | +| Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 47 | +| Total Non Kv | +19.52% | +16.70% | +21.45% | -38.61% | +97.49% | 47 | +| Kv Cache | +0.83% | -3.47% | +7.91% | -28.75% | +61.82% | 47 | +| Max Concurrency | -1.57% | -3.48% | +15.90% | -87.31% | +162.10% | 47 | + +### Baseline: TP=1, PP=1, len=8192, no quantization, default KV dtype (n=16) + +| Component | Mean error | Median | Mean abs | Min | Max | n | +|-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| +| Weight | -3.38% | -0.22% | +3.39% | -50.11% | +0.04% | 16 | +| Activation | +247.20% | +163.97% | +247.20% | +23.15% | +633.33% | 16 | +| Non Torch | -45.25% | -40.00% | +45.25% | -67.39% | -37.50% | 16 | +| Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 16 | +| Total Non Kv | +17.06% | +17.29% | +21.89% | -38.61% | +74.27% | 16 | +| Kv Cache | -4.11% | -4.29% | +8.18% | -28.75% | +31.06% | 16 | +| Max Concurrency | -0.76% | -4.29% | +21.22% | -87.31% | +162.10% | 16 | + +### Multi-GPU (TP > 1 or PP > 1) (n=15) + +| Component | Mean error | Median | Mean abs | Min | Max | n | +|-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| +| Weight | -1.20% | -0.38% | +1.20% | -12.22% | -0.03% | 15 | +| Activation | +196.02% | +153.39% | +196.02% | +23.15% | +561.16% | 15 
| +| Non Torch | -46.75% | -71.29% | +77.23% | -72.85% | +114.29% | 15 | +| Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 15 | +| Total Non Kv | +16.86% | +11.34% | +17.76% | -6.76% | +97.49% | 15 | +| Kv Cache | +11.26% | +4.62% | +11.63% | -1.88% | +61.82% | 15 | +| Max Concurrency | +6.58% | +4.63% | +16.32% | -71.19% | +62.24% | 15 | + +### Quantized Models (fp8-dynamic / w8a8 / w4a16) (n=10) + +| Component | Mean error | Median | Mean abs | Min | Max | n | +|-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| +| Weight | +7.29% | -0.29% | +7.94% | -0.79% | +76.18% | 10 | +| Activation | +124.00% | +149.15% | +124.00% | +14.68% | +153.97% | 10 | +| Non Torch | -52.67% | -41.15% | +52.67% | -72.85% | -37.50% | 10 | +| Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 10 | +| Total Non Kv | +21.87% | +15.87% | +23.22% | -6.76% | +87.45% | 10 | +| Kv Cache | -0.45% | -0.87% | +4.95% | -13.18% | +5.90% | 10 | +| Max Concurrency | -0.44% | -0.86% | +4.95% | -13.19% | +5.89% | 10 | + +### Non-default KV cache dtype (--kv-cache-dtype fp8) (n=2) + +| Component | Mean error | Median | Mean abs | Min | Max | n | +|-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| +| Weight | -0.34% | -0.34% | +0.34% | -0.45% | -0.22% | 2 | +| Activation | +153.68% | +153.68% | +153.68% | +153.39% | +153.97% | 2 | +| Non Torch | -38.75% | -38.75% | +38.75% | -40.00% | -37.50% | 2 | +| Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 2 | +| Total Non Kv | +17.83% | +17.83% | +17.83% | +16.28% | +19.37% | 2 | +| Kv Cache | -3.84% | -3.84% | +3.84% | -4.21% | -3.47% | 2 | +| Max Concurrency | -51.92% | -51.92% | +51.92% | -52.11% | -51.73% | 2 | + +## Per-Model Errors — Baseline Runs + +> TP=1, PP=1, max_model_len=8192, no quantization, default KV dtype. 
+ +| Model | Arch | Weight err | Activation err | Non-torch err | KV cache err | Max conc err | +|-------|------|:----------:|:--------------:|:-------------:|:------------:|:------------:| +| Qwen2.5-7B-Instruct | Qwen2 | -0.45% | +153.39% | -37.50% | -4.21% | -4.22% | +| Qwen2.5-7B-Instruct | Qwen2 | -0.45% | +153.39% | -37.50% | -4.21% | -4.22% | +| Qwen3-30B-A3B | Qwen3Moe | -0.02% | +198.51% | -44.44% | -28.75% | -28.72% | +| Qwen3-8B | Qwen3 | -0.09% | +153.39% | -40.00% | -4.36% | -4.36% | +| DeepSeek-V2-Lite-Chat | DeepseekV2 | -0.59% | +314.51% | -42.31% | -11.50% | -11.50% | +| granite-3.1-2b-instruct | Granite | -0.44% | +633.33% | -67.39% | -5.27% | -5.27% | +| granite-3.1-8b-instruct | Granite | -0.20% | +547.06% | -67.39% | -6.02% | -6.03% | +| granite-3.3-8b-instruct | Granite | -0.20% | +547.06% | -67.39% | -6.02% | -6.03% | +| granite-vision-3.3-2b | LlavaNext* | +0.04% | +216.46% | -40.00% | -1.23% | -1.23% | +| Llama-3.1-8B-Instruct | Llama | -0.22% | +153.97% | -40.00% | -3.47% | -3.48% | +| Llama-3.1-8B-Instruct | Llama | -0.22% | +153.97% | -40.00% | -3.47% | -3.48% | +| Llama-3.1-8B-Instruct | Llama | -50.11% | +117.19% | -40.00% | +31.06% | +162.10% | +| Llama-3.1-8B-Instruct | Llama | -0.22% | +153.97% | -40.00% | -3.47% | -3.48% | +| phi-4 | Phi3 | -0.31% | +261.84% | -40.00% | -6.59% | -6.58% | +| Mistral-Small-3.1-24B-Instruct-2503 | Mistral3* | -0.08% | +23.15% | -40.00% | +1.54% | +1.55% | +| Kimi-VL-A3B-Instruct | KimiVL* | -0.58% | +173.97% | -40.00% | -9.76% | -87.31% | + +## Argument Sensitivity Analysis + +> This section examines how each vLLM launch argument affects whether the capacity planner's memory predictions remain accurate. 
+ +### `--max-model-len` (context window size) + +| Model | max_model_len | Actual KV (GiB) | KV err | Actual tokens | Pred tokens | Token err | Max conc err | +|-------|:-------------:|:---------------:|:------:|:-------------:|:-----------:|:---------:|:------------:| +| Llama-3.1-8B-Instruct | 2,048 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.47% | +| Llama-3.1-8B-Instruct | 4,096 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.47% | +| Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,000 | 459,509 | -3.46% | -3.48% | +| Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.48% | +| Llama-3.1-8B-Instruct | 8,192 | 42.80 | +31.06% | 175,296 | 459,509 | +162.13% | +162.10% | +| Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.48% | +| Llama-3.1-8B-Instruct | 16,384 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.44% | +| Llama-3.1-8B-Instruct | 32,768 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.51% | +| Qwen2.5-7B-Instruct | 2,048 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | +| Qwen2.5-7B-Instruct | 4,096 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | +| Qwen2.5-7B-Instruct | 8,192 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | +| Qwen2.5-7B-Instruct | 8,192 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | +| Qwen2.5-7B-Instruct | 16,384 | 58.53 | -4.21% | 1,095,968 | 1,049,789 | -4.21% | -4.22% | +| Qwen2.5-7B-Instruct | 32,768 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | + +**Conclusion**: `--max-model-len` has **no effect on KV pool size** — the formula and vLLM agree on this. Activation memory is constant (the fixed profiling overhead does not depend on context length), so the KV pool prediction error stays flat at ~−3 to −4% regardless of whether context is 2 K or 32 K tokens. The token/concurrency predictions carry that same constant KV error forward, plus any error from the per-token KV formula. 
+ +### `--tensor-parallel-size` (TP) + +| Model | TP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err | +|-------|:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:| +| Llama-3.1-8B-Instruct | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | +| Llama-3.1-8B-Instruct | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | +| Llama-3.1-8B-Instruct | 1 | 29.98 | -50.11% | 2.21 | +117.19% | 0.25 | -40.00% | +31.06% | +| Llama-3.1-8B-Instruct | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | +| Llama-3.1-8B-Instruct | 2 | 7.51 | -0.42% | 1.89 | +153.97% | 2.07 | -71.01% | +2.76% | +| Llama-3.1-8B-Instruct | 4 | 3.77 | -0.81% | 1.89 | +153.97% | 2.13 | -71.83% | +4.48% | +| Qwen2.5-7B-Instruct | 1 | 14.25 | -0.45% | 2.21 | +153.39% | 0.24 | -37.50% | -4.21% | +| Qwen2.5-7B-Instruct | 1 | 14.25 | -0.45% | 2.21 | +153.39% | 0.24 | -37.50% | -4.21% | +| Qwen2.5-7B-Instruct | 2 | 7.12 | -0.38% | 2.21 | +153.39% | 2.06 | -70.87% | +2.61% | +| Qwen2.5-7B-Instruct | 4 | 3.55 | -0.10% | 2.21 | +153.39% | 2.13 | -71.83% | +4.62% | + +**Conclusions**: + +- **Weights scale correctly**: the formula divides by TP, matching vLLM's per-GPU weight sharding. Weight error stays near 0% across TP=1–4. +- **Activation is TP-invariant in both formula and reality**: vLLM's profiling overhead does not shrink with TP (it captures the same set of batch sizes). The formula also keeps activation constant with TP. Error stays flat. +- **Non-torch is heavily under-estimated for TP≥2**: the 0.60 GiB/GPU constant does not capture NCCL all-reduce buffer overhead, which grows with TP. Actual non-torch reaches ~2.1 GiB/GPU at TP=4 (3.5× the constant). However, this error is partially masked in KV cache accuracy because the over-estimated activation pulls the prediction in the opposite direction. 
+ +### `--pipeline-parallel-size` (PP) + +| PP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err | +|:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:| +| 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | +| 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | +| 1 | 29.98 | -50.11% | 2.21 | +117.19% | 0.25 | -40.00% | +31.06% | +| 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | +| 2 | 7.51 | -0.42% | 1.10 | +336.36% | 0.07 | +114.29% | -0.85% | +| 4 | 4.26 | -12.22% | 1.05 | +357.14% | 0.07 | +114.29% | +1.59% | + +**Conclusions**: + +- **Activation drops sharply with PP**: at PP=1, vLLM profiles 1.89 GiB of activation; at PP=2 it drops to 1.10 GiB; at PP=4 to 1.05 GiB. Each pipeline stage runs fewer transformer layers, so the profiling sweep allocates proportionally less. The formula does not account for this and always predicts 4.80 GiB, making the activation error grow with PP (from ~+154% at PP=1 to ~+357% at PP=4). +- **Non-torch increases with PP** due to inter-stage P2P send/receive buffers, but the formula uses the same TP=1 constant (0.15 GiB/GPU) regardless of PP, causing the non-torch estimate to overshoot actual (predicted > actual for PP>1 because each stage is a separate process and 0.15 is per-GPU). These two errors partially offset each other in the KV cache prediction. +- **Weight error grows with PP**: the formula divides only by TP×PP for weight sharding, but with PP=4, model layers are not uniformly distributed across stages in all cases (irregular last-stage allocation can leave a stage with fewer params). 
+ +### `--dtype` (compute/storage dtype override) + +| dtype arg | quantization | kv_cache_dtype | Actual weight (GiB) | Weight err | Actual KV (GiB) | KV err | +|-----------|:------------:|:--------------:|:-------------------:|:----------:|:---------------:|:------:| +| bfloat16 | None | auto | 14.99 | -0.22% | 58.11 | -3.47% | +| bfloat16 | None | auto | 14.99 | -0.22% | 58.11 | -3.47% | +| bfloat16 | None | fp8 | 14.99 | -0.22% | 58.11 | -3.47% | +| bfloat16 | fp8 | auto | 8.49 | +76.18% | 64.61 | -13.18% | +| float16 | None | auto | 14.99 | -0.22% | 58.11 | -3.47% | +| float16 | compressed-tensors | auto | 8.49 | -0.35% | 64.60 | -3.11% | +| float16 | gptq_marlin | auto | 5.38 | -0.71% | 67.71 | -2.96% | +| float32 | None | auto | 29.98 | -50.11% | 42.80 | +31.06% | + +**Conclusions**: + +- **`--dtype float32`** doubles model weight memory (29.98 GiB vs BF16's 14.99 GiB). The planner reads the HuggingFace config dtype (BF16) and is unaware of the `--dtype` vLLM override, so it predicts 14.96 GiB — a **−50% weight error**, which cascades into a +31% KV cache over-prediction (the planner thinks there is more room than there is). +- **`--dtype float16`** is handled correctly because the HuggingFace config also stores float16 for these models; weight error stays near 0%. +- **FP8-dynamic quantization** (`fp8` in the quantization column) halves weight memory. The planner reads `quantization_config` from the HuggingFace repo and applies the FP8 byte-per-param, yielding near-zero weight error. KV cache error stays consistent with the activation over-estimation. +- **`--kv-cache-dtype fp8`** does not affect weight or activation predictions, but halves per-token KV storage. The planner ignores this flag and predicts KV tokens ~50% too low (see dedicated section below). 
+ +### `--quantization` (weight quantization method) + +| Model | quant method | TP | Actual weight (GiB) | Weight err | Actual KV (GiB) | KV err | +|-------|--------------|----|:-------------------:|:----------:|:---------------:|:------:| +| Llama-3.3-70B-Instruct-fp8-dyn | compressed-tensors | 2 | 33.88 | -0.12% | 37.28 | +5.04% | +| Llama-3.3-70B-Instruct-fp8-dyn | compressed-tensors | 4 | 16.96 | -0.24% | 54.09 | +5.90% | +| Meta-Llama-3.1-8B-Instruct-qua | compressed-tensors | 1 | 8.49 | -0.35% | 64.60 | -3.11% | +| Mistral-Small-3.1-24B-Instruct | compressed-tensors | 1 | 24.07 | -0.18% | 48.73 | +1.22% | +| Mistral-Small-3.1-24B-Instruct | compressed-tensors | 2 | 12.11 | -0.79% | 59.02 | +5.28% | +| Qwen2.5-7B-Instruct-fp8-dynami | compressed-tensors | 1 | 8.14 | -0.36% | 64.64 | -3.87% | +| Qwen2.5-7B-Instruct-quantized. | compressed-tensors | 1 | 8.14 | -0.36% | 64.64 | -3.87% | +| Llama-3.3-70B-Instruct-quantiz | compressed-tensors | 2 | 33.88 | -0.12% | 37.28 | +5.04% | +| Llama-3.1-8B-Instruct | fp8 | 1 | 8.49 | +76.18% | 64.61 | -13.18% | +| Meta-Llama-3.1-8B-Instruct-qua | gptq_marlin | 1 | 5.38 | -0.71% | 67.71 | -2.96% | + +**Conclusions**: + +- **w8a8 (compressed-tensors INT8)**: the planner parses `config_groups` from the `quantization_config` to find `num_bits=8` and applies 1 byte/param. Weight errors are near zero (−0.3 to −0.7%), indicating the INT8 parameter count is well-captured. +- **w4a16 (GPTQ-marlin INT4)**: the planner parses `num_bits=4` from the quantization config and applies 0.5 bytes/param. Weight error is small (~−0.7%). The large reduction in weights (5.3 GiB vs 15 GiB for BF16) frees more KV cache, and the planner correctly tracks this effect — KV error stays in the −3% range. +- **fp8-dynamic** (fp8 per-tensor dynamic quant via `compressed-tensors`): the planner extracts fp8 precision from the quantization config. Weight error is near zero. 
Unexpectedly, weight error for the RedHat fp8 70B model at TP=2 stays very low, confirming the quant config parsing is correct for this variant. + +### `--kv-cache-dtype` (KV cache precision) + +| Model | kv_cache_dtype | Actual KV (GiB) | KV err | Actual tokens | Pred tokens | Token err | Conc err | +|-------|:--------------:|:---------------:|:------:|:-------------:|:-----------:|:---------:|:--------:| +| Qwen2.5-7B-Instruct | auto | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | +| Qwen2.5-7B-Instruct | fp8 | 58.53 | -4.21% | 2,192,000 | 1,049,789 | -52.11% | -52.11% | +||||||||| +| Llama-3.1-8B-Instruct | auto | 58.11 | -3.47% | 476,000 | 459,509 | -3.46% | -3.48% | +| Llama-3.1-8B-Instruct | fp8 | 58.11 | -3.47% | 952,032 | 459,509 | -51.73% | -51.73% | +||||||||| + +**Conclusion**: `--kv-cache-dtype fp8` stores each KV element in 1 byte instead of 2 bytes (BF16/FP16), doubling the number of tokens that fit in the KV pool. The KV pool size in GiB is unaffected (same activation and weight overhead), so the **KV GiB error stays near −4%** (the same as the default-dtype baseline). But because the planner always computes per-token bytes from the model's native compute dtype, **token count and max-concurrency predictions are ~52% too low** for fp8-KV runs. This is a direct, fixable bug: the planner should accept `kv_cache_dtype` as an input parameter and apply 1 byte/token when it is `fp8`. + +## Root Cause Analysis + +### 1. Activation Memory — Largest Error Source + +The planner uses **fixed constants per architecture** (e.g., 4.8 GiB for Llama, 5.6 GiB for Qwen2/3) empirically measured at `max_model_len=16000`. 
vLLM v0.19.0 reports substantially lower values during its profiling phase: + +| Architecture | Predicted (GiB) | Observed range (GiB) | Error range | +|-------------|:---------------:|:--------------------:|:-----------:| +| DeepseekV2 | 8.00 | 1.93–1.93 | +314.51% to +314.51% | +| Granite | 5.50 | 0.75–0.85 | +547.06% to +633.33% | +| KimiVL* | 8.00 | 2.85–2.92 | +173.97% to +180.70% | +| Llama | 4.80 | 1.05–2.21 | +117.19% to +357.14% | +| LlavaNext* | 2.50 | 0.79–0.79 | +216.46% to +216.46% | +| Mistral3* | 2.50 | 2.03–2.18 | +14.68% to +23.15% | +| Mixtral | 8.00 | 1.21–1.21 | +561.16% to +561.16% | +| Phi3 | 5.50 | 1.52–1.52 | +261.84% to +261.84% | +| Qwen2 | 5.60 | 2.21–2.29 | +144.54% to +153.39% | +| Qwen3 | 5.60 | 2.21–2.21 | +153.39% to +153.39% | +| Qwen3Moe | 8.00 | 2.68–2.68 | +198.51% to +198.51% | + +The discrepancy suggests the constants were measured with an older vLLM version or different compilation settings. Re-calibrating to these v0.19.0 measurements would be the highest-value fix. + +### 2. Non-torch Memory — Underestimated for Multi-GPU + +| TP | PP | Constant used | Actual mean (GiB) | Mean error | +|:--:|:--:|:-------------:|:-----------------:|:----------:| +| 1 | 1 | 0.15 GiB | 0.27 | -42.23% | +| 1 | 2 | 0.15 GiB | 0.07 | +114.29% | +| 1 | 4 | 0.15 GiB | 0.07 | +114.29% | +| 2 | 1 | 0.6 GiB | 2.08 | -71.17% | +| 4 | 1 | 0.6 GiB | 2.17 | -72.34% | + +For TP=1 the formula slightly under-estimates (0.15 vs ~0.25 GiB actual). For TP≥2, NCCL all-reduce buffers push actual non-torch to ~2.1 GiB — 3.5× the 0.60 GiB constant. For PP≥2, P2P send/receive adds overhead that the formula doesn't model at all. + +### 3. GPU Memory Catalog vs Physical + +The planner uses 80 GiB (catalog) but H100 physical VRAM is 79.19 GiB: + +- Catalog available: 80 × 0.95 = **76.00 GiB** +- Physical available: 79.19 × 0.95 = **75.23 GiB** +- Systematic KV over-prediction from this source alone: **+0.77 GiB** + +### 4. 
CUDA Graph Memory — Excluded from Formula + +The planner returns 0.0 GiB for CUDA graphs (treating it as included in activation). vLLM allocates the CUDA graph pool *after* sizing the KV cache, so the reported KV pool includes CUDA graph memory. The formula is therefore consistent with the log-reported KV number — no fix needed, but it should be documented. + +Observed CUDA graph pool sizes: 0.51–1.85 GiB (mean 1.03 GiB). + +## Recommendations + +| Priority | Fix | Expected impact | +|:--------:|-----|:---------------:| +| 🔴 High | **Re-calibrate activation constants** from v0.19.0 measurements. Current constants are 2–7× too high. Updating to ~1.0–2.2 GiB/architecture would remove the single largest prediction error. | +4–10 GiB KV accuracy | +| 🔴 High | **Accept `--kv-cache-dtype` as a planner input.** When set to `fp8`, halve the per-token KV bytes. This is a one-line formula change. | 2× token/concurrency accuracy for fp8-KV runs | +| 🔴 High | **Accept `--dtype` as a planner input.** When set to `float32`, double the per-param bytes for weight estimation. | Fixes −50% weight error for float32 runs | +| 🟡 Medium | **Re-measure non-torch constants for TP≥2 and PP≥2.** NCCL overhead scales with both and is currently under-estimated by ~3.5×. | +1–2 GiB KV accuracy for multi-GPU | +| 🟡 Medium | **Scale activation constant by 1/PP.** Each pipeline stage processes layers/PP transformer blocks; profiling overhead scales proportionally. | Fixes growing activation error at high PP | +| 🟢 Low | **Use physical GPU memory** (79.19 GiB for H100) rather than the catalog 80 GiB nominal. 
| +0.77 GiB KV accuracy | \ No newline at end of file diff --git a/accuracy/results/v0.19.0/deep_analysis.md b/accuracy/results/v0.19.0/deep_analysis.md deleted file mode 100644 index f6e07682..00000000 --- a/accuracy/results/v0.19.0/deep_analysis.md +++ /dev/null @@ -1,333 +0,0 @@ -# Capacity Planner — Deep Accuracy Analysis - -_vLLM v0.19.0 · H100-80GB · 64 runs · 23 models_ - -## Executive Summary - -**Runs analyzed**: 64 across 23 models on 1 GPU type(s). - -### Overall accuracy - -| Cohort | N | Mean err | MAE | Min / Max | ≤5% | ≤10% | -|---|---|---|---|---|---|---| -| Weight memory | 64 | -1.8% | +1.8% | -50.1% / 0.0% | 94% | 95% | -| KV cache memory | 64 | +1.9% | +7.6% | -32.7% / +61.9% | 66% | 89% | - -## By architecture type - -| Cohort | N | Mean err | MAE | Min / Max | ≤5% | ≤10% | -|---|---|---|---|---|---|---| -| **Dense** — weight | 49 | -1.8% | +1.8% | -50.1% / 0.0% | 94% | 96% | -| **Dense** — KV | 49 | +0.9% | +6.2% | -32.7% / +60.9% | 71% | 94% | -| **MoE** — weight | 11 | -2.0% | +2.0% | -11.8% / -0.0% | 91% | 91% | -| **MoE** — KV | 11 | +7.5% | +15.2% | -28.7% / +61.9% | 36% | 64% | -| **Multimodal** — weight | 4 | -0.7% | +0.7% | -1.6% / 0.0% | 100% | 100% | -| **Multimodal** — KV | 4 | -1.5% | +4.0% | -9.8% / +2.6% | 75% | 100% | - -## Per-model-family accuracy - -| Cohort | N | Mean err | MAE | Min / Max | ≤5% | ≤10% | -|---|---|---|---|---|---|---| -| **DeepSeek** — weight | 3 | -1.5% | +1.5% | -2.7% / -0.6% | 100% | 100% | -| **DeepSeek** — KV | 3 | -2.4% | +5.3% | -11.5% / +3.7% | 67% | 67% | -| **GPT-OSS (openai)** — weight | 1 | -11.8% | +11.8% | -11.8% / -11.8% | 0% | 0% | -| **GPT-OSS (openai)** — KV | 1 | +5.5% | +5.5% | +5.5% / +5.5% | 0% | 100% | -| **Granite** — weight | 6 | -0.9% | +0.9% | -1.8% / -0.2% | 100% | 100% | -| **Granite** — KV | 6 | -0.8% | +2.9% | -6.0% / +2.7% | 67% | 100% | -| **Granite-Vision** — weight | 2 | -0.4% | +0.4% | -0.7% / 0.0% | 100% | 100% | -| **Granite-Vision** — KV | 2 | +0.7% | +1.9% | -1.2% / 
+2.6% | 100% | 100% | -| **Kimi** — weight | 2 | -0.3% | +0.3% | -0.4% / -0.2% | 100% | 100% | -| **Kimi** — KV | 2 | +35.6% | +35.6% | +9.3% / +61.9% | 0% | 50% | -| **Kimi-VL** — weight | 2 | -1.1% | +1.1% | -1.6% / -0.6% | 100% | 100% | -| **Kimi-VL** — KV | 2 | -3.7% | +6.1% | -9.8% / +2.4% | 50% | 100% | -| **Llama-3.1** — weight | 16 | -4.2% | +4.2% | -50.1% / -0.2% | 88% | 88% | -| **Llama-3.1** — KV | 16 | +0.2% | +4.8% | -3.5% / +31.1% | 94% | 94% | -| **Llama-3.3** — weight | 5 | -0.2% | +0.2% | -0.2% / -0.1% | 100% | 100% | -| **Llama-3.3** — KV | 5 | -2.2% | +10.9% | -32.7% / +5.9% | 0% | 80% | -| **Llama-4** — weight | 1 | -4.8% | +4.8% | -4.8% / -4.8% | 100% | 100% | -| **Llama-4** — KV | 1 | +36.2% | +36.2% | +36.2% / +36.2% | 0% | 0% | -| **Mistral-Small** — weight | 5 | -1.7% | +1.7% | -5.7% / -0.1% | 80% | 100% | -| **Mistral-Small** — KV | 5 | +4.5% | +4.5% | +1.2% / +7.5% | 40% | 100% | -| **Mixtral** — weight | 2 | -0.0% | +0.0% | -0.0% / -0.0% | 100% | 100% | -| **Mixtral** — KV | 2 | +0.3% | +2.2% | -1.9% / +2.4% | 100% | 100% | -| **Phi** — weight | 2 | -0.6% | +0.6% | -0.9% / -0.3% | 100% | 100% | -| **Phi** — KV | 2 | -2.3% | +4.3% | -6.6% / +2.0% | 50% | 100% | -| **Qwen2.5** — weight | 13 | -0.4% | +0.4% | -0.4% / 0.0% | 100% | 100% | -| **Qwen2.5** — KV | 13 | +3.1% | +8.8% | -4.2% / +60.9% | 85% | 92% | -| **Qwen3** — weight | 4 | -0.1% | +0.1% | -0.3% / -0.0% | 100% | 100% | -| **Qwen3** — KV | 4 | -5.8% | +10.8% | -28.7% / +5.4% | 50% | 75% | - -## TP sensitivity - -_KV cache error grouped by tensor-parallel degree (all models). 
After applying the per-GPU normalisation (÷TP×PP)._ - -| Cohort | N | Mean err | MAE | Min / Max | ≤5% | ≤10% | -|---|---|---|---|---|---|---| -| TP=1 | 34 | -4.3% | +6.4% | -32.7% / +31.1% | 76% | 88% | -| TP=2 | 15 | +10.5% | +10.7% | -1.9% / +61.9% | 60% | 87% | -| TP=4 | 15 | +7.3% | +7.3% | +2.4% / +36.2% | 47% | 93% | - -## PP sensitivity - -_KV cache error grouped by pipeline-parallel degree._ - -| Cohort | N | Mean err | MAE | Min / Max | ≤5% | ≤10% | -|---|---|---|---|---|---|---| -| PP=1 | 62 | +1.9% | +7.8% | -32.7% / +61.9% | 65% | 89% | -| PP=2 | 1 | -0.9% | +0.9% | -0.9% / -0.9% | 100% | 100% | -| PP=4 | 1 | +1.6% | +1.6% | +1.6% / +1.6% | 100% | 100% | - -## Context-length sensitivity (TP=1 runs only) - -_Models tested at multiple max_model_len values. KV cache error should be constant if the formula is context-length-agnostic._ - -**Qwen/Qwen2.5-7B-Instruct** - -| max_len | KV err | -|---|---| -| 2048 | -4.2% | -| 4096 | -4.2% | -| 8192 | -4.2% | -| 8192 | -4.2% | -| 8192 | -4.2% | -| 8192 | -4.2% | -| 16384 | -4.2% | -| 32768 | -4.2% | - -**meta-llama/Llama-3.1-8B-Instruct** - -| max_len | KV err | -|---|---| -| 2048 | -3.5% | -| 4096 | -3.5% | -| 8192 | -3.5% | -| 8192 | -3.5% | -| 8192 | -3.5% | -| 8192 | +31.1% | -| 8192 | -3.5% | -| 32768 | -3.5% | - -## Outliers (|error| > 10%) - -| Model | TP | PP | Weight err | KV err | Likely cause | -|---|---|---|---|---|---| -| moonshotai/Kimi-Dev-72B | 2 | 1 | -0.2% | +61.9% | TP/PP residual: per-GPU normalisation may be imprecise | -| Qwen/Qwen2.5-72B-Instruct | 2 | 1 | -0.1% | +60.9% | TP/PP residual: per-GPU normalisation may be imprecise | -| meta-llama/Llama-4-Scout-17B-16E-Instruct | 4 | 1 | -4.8% | +36.2% | TP/PP residual: per-GPU normalisation may be imprecise | -| redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 | 1 | 1 | -0.1% | -32.7% | large model: activation constant may underestimate real overhead | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | -50.1% | +31.1% | KV formula overestimates 
available budget | -| Qwen/Qwen3-30B-A3B | 1 | 1 | -0.0% | -28.7% | MoE: routing overhead not modeled in activation/KV budget | -| deepseek-ai/DeepSeek-V2-Lite-Chat | 1 | 1 | -0.6% | -11.5% | unknown | -| openai/gpt-oss-20b | 4 | 1 | -11.8% | +5.5% | MoE/sparse model: shared expert / embedding memory not sharded by TP | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 4 | -12.2% | +1.6% | PP≥4: weight sharding formula incorrect for high PP | - -## Calibration notes - -### Weight memory - -- Mean error -1.8% — slightly negative (planner underestimates). Cause: safetensors metadata reports storage dtype; actual in-memory size can differ due to alignment/padding. - -- PP≥4 and certain MoE models show >10% weight error — embedding and shared-expert tensors may not be sharded by TP/PP as assumed by the formula. - -### KV cache memory (TP=1) - -- TP=1 KV mean error -4.6% (MAE +6.7%). Mostly within ±10%. - -- Consistent negative bias across TP=1 configs suggests activation_memory constant is slightly too high (over-reserves budget, leaving less for KV). - -### KV cache memory (TP>1) - -- After ÷(TP×PP) normalisation, errors are within ±10% for most models. - -- Remaining positive bias at TP=2/4 is consistent with extra NCCL/all-gather buffers not captured by non_torch constant. - -### Large-model KV outliers - -- `Qwen3-30B-A3B` (TP=1): −29%. MoE routing buffers consume more memory than modeled. - -- `Llama-3.3-70B-w8a8` (TP=1): −33%. W8A8 quantization increases activation-memory footprint (dequant workspace) not accounted for in constant. - -- `Kimi-Dev-72B` (TP=2): +62%. Likely residual normalisation issue or model-specific memory layout. - -- `Qwen2.5-72B` (TP=2): +61%. Same pattern as Kimi-Dev-72B — large model at TP=2 still shows excess after normalisation. 
- -## Per-model breakdown - -### Qwen/Qwen2.5-72B-Instruct _Qwen2.5 · Dense_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 2 | 1 | 1 | 8192 | auto | — | auto | -0.1% | +60.9% | -| 4 | 1 | 1 | 8192 | auto | — | auto | -0.4% | +9.3% | - -### Qwen/Qwen2.5-7B-Instruct _Qwen2.5 · Dense_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 1 | 1 | 1 | 2048 | auto | — | auto | -0.4% | -4.2% | -| 1 | 1 | 1 | 4096 | auto | — | auto | -0.4% | -4.2% | -| 1 | 1 | 1 | 8192 | bfloat16 | — | fp8 | -0.4% | -4.2% | -| 1 | 1 | 1 | 8192 | bfloat16 | — | auto | -0.4% | -4.2% | -| 1 | 1 | 1 | 8192 | float16 | — | auto | -0.4% | -4.2% | -| 1 | 1 | 1 | 8192 | auto | — | auto | -0.4% | -4.2% | -| 1 | 1 | 1 | 16384 | auto | — | auto | -0.4% | -4.2% | -| 1 | 1 | 1 | 32768 | auto | — | auto | -0.4% | -4.2% | -| 2 | 1 | 1 | 8192 | auto | — | auto | -0.4% | +2.6% | -| 4 | 1 | 1 | 8192 | auto | — | auto | 0.0% | +4.6% | - -### Qwen/Qwen3-30B-A3B _Qwen3 · MoE_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 1 | 1 | 1 | 8192 | auto | — | auto | -0.0% | -28.7% | -| 4 | 1 | 1 | 8192 | auto | — | auto | -0.2% | +5.4% | - -### Qwen/Qwen3-8B _Qwen3 · Dense_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 1 | 1 | 1 | 8192 | auto | — | auto | -0.1% | -4.4% | -| 4 | 1 | 1 | 8192 | auto | — | auto | -0.3% | +4.7% | - -### RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic _Llama-3.3 · Dense_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 2 | 1 | 1 | 8192 | auto | — | auto | -0.1% | +5.0% | -| 4 | 1 | 1 | 8192 | auto | — | auto | -0.2% | +5.9% | - -### RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 _Llama-3.1 · Dense_ - -| TP | PP | DP | max_len | 
dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 1 | 1 | 1 | 8192 | float16 | — | auto | -0.7% | -3.0% | - -### RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 _Llama-3.1 · Dense_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 1 | 1 | 1 | 8192 | float16 | — | auto | -0.4% | -3.1% | -| 1 | 1 | 1 | 8192 | float16 | — | auto | -0.4% | -3.1% | -| 1 | 1 | 1 | 8192 | float16 | — | auto | -0.4% | -3.1% | - -### RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 _Mistral-Small · Dense_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 1 | 1 | 1 | 8192 | auto | — | auto | -0.2% | +1.2% | -| 2 | 1 | 1 | 8192 | auto | — | auto | -0.8% | +5.3% | - -### RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 _Qwen2.5 · Dense_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 1 | 1 | 1 | 8192 | auto | — | auto | -0.4% | -3.9% | - -### deepseek-ai/DeepSeek-V2-Lite-Chat _DeepSeek · MoE_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 1 | 1 | 1 | 8192 | auto | — | auto | -0.6% | -11.5% | -| 2 | 1 | 1 | 8192 | auto | — | auto | -1.3% | +0.6% | -| 4 | 1 | 1 | 8192 | auto | — | auto | -2.7% | +3.7% | - -### ibm-granite/granite-3.1-2b-instruct _Granite · Dense_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 1 | 1 | 1 | 8192 | auto | — | auto | -0.4% | -5.3% | -| 2 | 1 | 1 | 8192 | auto | — | auto | -0.8% | +0.4% | - -### ibm-granite/granite-3.1-8b-instruct _Granite · Dense_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 4 | 1 | 1 | 8192 | auto | — | auto | -1.8% | +2.7% | - -### ibm-granite/granite-3.3-8b-instruct 
_Granite · Dense_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 1 | 1 | 1 | 8192 | auto | — | auto | -0.2% | -6.0% | -| 2 | 1 | 1 | 8192 | auto | — | auto | -0.4% | +0.6% | -| 4 | 1 | 1 | 8192 | auto | — | auto | -1.8% | +2.7% | - -### ibm-granite/granite-vision-3.3-2b _Granite-Vision · Multimodal_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 1 | 1 | 1 | 8192 | auto | — | auto | 0.0% | -1.2% | -| 2 | 1 | 1 | 8192 | auto | — | auto | -0.7% | +2.6% | - -### meta-llama/Llama-3.1-8B-Instruct _Llama-3.1 · Dense_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 1 | 1 | 1 | 2048 | auto | — | auto | -0.2% | -3.5% | -| 1 | 1 | 1 | 4096 | auto | — | auto | -0.2% | -3.5% | -| 1 | 1 | 1 | 8192 | bfloat16 | — | fp8 | -0.2% | -3.5% | -| 1 | 1 | 1 | 8192 | bfloat16 | — | auto | -0.2% | -3.5% | -| 1 | 1 | 1 | 8192 | float16 | — | auto | -0.2% | -3.5% | -| 1 | 1 | 1 | 8192 | float32 | — | auto | -50.1% | +31.1% | -| 1 | 1 | 1 | 8192 | auto | — | auto | -0.2% | -3.5% | -| 1 | 1 | 1 | 32768 | auto | — | auto | -0.2% | -3.5% | -| 1 | 2 | 1 | 8192 | auto | — | auto | -0.4% | -0.9% | -| 1 | 4 | 1 | 8192 | auto | — | auto | -12.2% | +1.6% | -| 2 | 1 | 1 | 8192 | auto | — | auto | -0.4% | +2.8% | -| 4 | 1 | 1 | 8192 | auto | — | auto | -0.8% | +4.5% | - -### meta-llama/Llama-4-Scout-17B-16E-Instruct _Llama-4 · MoE_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 4 | 1 | 1 | 8192 | auto | — | auto | -4.8% | +36.2% | - -### microsoft/phi-4 _Phi · Dense_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 1 | 1 | 1 | 8192 | auto | — | auto | -0.3% | -6.6% | -| 2 | 1 | 1 | 8192 | auto | — | auto | -0.9% | +2.0% | - -### 
mistralai/Mistral-Small-3.1-24B-Instruct-2503 _Mistral-Small · Dense_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 1 | 1 | 1 | 8192 | auto | — | auto | -0.1% | +1.6% | -| 2 | 1 | 1 | 8192 | auto | — | auto | -1.9% | +7.2% | -| 4 | 1 | 1 | 8192 | auto | — | auto | -5.7% | +7.5% | - -### mistralai/Mixtral-8x7B-Instruct-v0.1 _Mixtral · MoE_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 2 | 1 | 1 | 8192 | auto | — | auto | -0.0% | -1.9% | -| 4 | 1 | 1 | 8192 | auto | — | auto | -0.0% | +2.4% | - -### moonshotai/Kimi-Dev-72B _Kimi · Dense_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 2 | 1 | 1 | 8192 | auto | — | auto | -0.2% | +61.9% | -| 4 | 1 | 1 | 8192 | auto | — | auto | -0.4% | +9.3% | - -### moonshotai/Kimi-VL-A3B-Instruct _Kimi-VL · Multimodal_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 1 | 1 | 1 | 8192 | auto | — | auto | -0.6% | -9.8% | -| 2 | 1 | 1 | 8192 | auto | — | auto | -1.6% | +2.4% | - -### openai/gpt-oss-20b _GPT-OSS (openai) · MoE_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 4 | 1 | 1 | 8192 | auto | — | auto | -11.8% | +5.5% | - -### redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 _Llama-3.3 · Dense_ - -| TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | KV err | -|---|---|---|---|---|---|---|---|---| -| 1 | 1 | 1 | 8192 | auto | — | auto | -0.1% | -32.7% | -| 2 | 1 | 1 | 8192 | auto | — | auto | -0.1% | +5.0% | -| 4 | 1 | 1 | 8192 | auto | — | auto | -0.2% | +5.9% | diff --git a/accuracy/results/v0.19.0/logs/deepseek-ai-deepseek-v2---h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/deepseek-ai-deepseek-v2---h100-80gb--tp1pp1dp1--8192.log new file
mode 100644 index 00000000..a83b1e91 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/deepseek-ai-deepseek-v2---h100-80gb--tp1pp1dp1--8192.log @@ -0,0 +1,787 @@ +DEBUG 04-22 00:45:57 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:45:57 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:45:57 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:45:57 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:45:57 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:45:57 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:45:57 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:45:57 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:45:57 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:45:57 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:45:57 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:45:57 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:46:01 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 00:46:03 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 00:46:03 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:46:03 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:46:03 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:46:03 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 00:46:03 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:46:03 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:46:03 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:46:03 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model deepseek-ai/DeepSeek-V2-Lite-Chat +(APIServer pid=1) INFO 04-22 00:46:03 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:46:03 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:46:03 [entrypoints/utils.py:233] non-default args: {'model_tag': 'deepseek-ai/DeepSeek-V2-Lite-Chat', 'model': 'deepseek-ai/DeepSeek-V2-Lite-Chat', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:46:03 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) `rope_scaling`'s factor field must be a float >= 1, got 40 +(APIServer pid=1) `rope_scaling`'s beta_fast field must be a float, got 32 +(APIServer pid=1) `rope_scaling`'s beta_slow field must be a float, got 1 +(APIServer pid=1) DEBUG 04-22 00:46:04 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.deepseek_v2.DeepseekV2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:46:04 
[logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0030133 secs +(APIServer pid=1) INFO 04-22 00:46:04 [config/model.py:549] Resolved architecture: DeepseekV2ForCausalLM +(APIServer pid=1) INFO 04-22 00:46:04 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:46:04 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:46:04 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:46:04 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:46:04 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:46:04 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 00:46:04 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:46:04 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:46:04 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:46:04 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:46:04 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:46:04 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:46:08 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:46:08 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:46:08 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:46:08 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-22 00:46:08 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:46:08 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:46:08 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:46:08 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:46:08 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:46:08 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:46:08 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:46:08 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:46:13 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=243) DEBUG 04-22 00:46:14 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:46:14 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=243) DEBUG 04-22 00:46:14 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/fed5a9fd-db47-4db7-9d5b-3619315b9d45'], outputs=['ipc:///tmp/61bf74bf-8446-40f3-9c7e-768ec2e40381'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 00:46:14 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 00:46:14 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 00:46:14 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 00:46:14 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 00:46:14 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=243) INFO 04-22 00:46:14 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='deepseek-ai/DeepSeek-V2-Lite-Chat', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-V2-Lite-Chat', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=deepseek-ai/DeepSeek-V2-Lite-Chat, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) DEBUG 04-22 00:46:15 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.30:43211 backend=nccl +(EngineCore pid=243) INFO 04-22 00:46:15 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.30:43211 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) DEBUG 04-22 00:46:15 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) INFO 04-22 00:46:15 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A +(EngineCore pid=243) DEBUG 04-22 00:46:15 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776818775.8321583, auto_measure=True +(EngineCore pid=243) DEBUG 04-22 00:46:15 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=243) DEBUG 04-22 00:46:15 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:46:15 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:46:15 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:46:15 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=243) DEBUG 04-22 00:46:15 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. 
Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(EngineCore pid=243) DEBUG 04-22 00:46:15 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=243) DEBUG 04-22 00:46:16 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=243) INFO 04-22 00:46:16 [v1/worker/gpu_model_runner.py:4735] Starting to load model deepseek-ai/DeepSeek-V2-Lite-Chat... +(EngineCore pid=243) DEBUG 04-22 00:46:16 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=576, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=True, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASHINFER_MLA: [compute capability not supported], FLASHMLA_SPARSE: [non-sparse not supported]}. +(EngineCore pid=243) INFO 04-22 00:46:16 [platforms/cuda.py:334] Using FLASH_ATTN_MLA attention backend out of potential backends: ['FLASH_ATTN_MLA', 'FLASHMLA', 'TRITON_MLA']. +(EngineCore pid=243) INFO 04-22 00:46:16 [model_executor/.../attention/mla_attention.py:2137] Using FlashAttention prefill for MLA +(EngineCore pid=243) INFO 04-22 00:46:16 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. +(EngineCore pid=243) INFO 04-22 00:46:16 [model_executor/.../oracle/unquantized.py:186] Using TRITON backend for Unquantized MoE +(EngineCore pid=243) DEBUG 04-22 00:46:16 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts +(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
+(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=243) DEBUG 04-22 00:46:16 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=243) DEBUG 04-22 00:46:16 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=243) DEBUG 04-22 00:46:16 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 82, 'mla_decode_concat_quant_fp8': 27, 'silu_and_mul': 27, 'fused_moe': 26, 'unquantized_fused_moe': 26, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=243) DEBUG 04-22 00:46:16 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=243) DEBUG 04-22 00:46:17 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00003-of-000004.safetensors', 'model-00002-of-000004.safetensors', 'model-00001-of-000004.safetensors', 'model-00004-of-000004.safetensors']] +(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/mla_attention.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/mla.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v2.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=243) INFO 04-22 00:46:49 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/3585710925/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1063] torch.compile cache factors: 
env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=6e053dcd55 comp=e546579c48 code=3dff38c17fb0ff7fac2589dc3dae8f8ea483056a47cdbf285df4c2af5c769b39 dir=/data/.cache/vllm/torch_compile_cache/3585710925/rank_0_0/backbone +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore 
pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, 
+(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=243) 
DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=243) DEBUG 04-22 00:46:49 
[compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=243) DEBUG 04-22 
00:46:49 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=243) DEBUG 04-22 00:46:49 
[compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=243) DEBUG 04-22 
00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore 
pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=243) 
DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 
'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] Vllm config hash: 6e053dcd55 +(EngineCore pid=243) INFO 04-22 00:46:49 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.97 s +(EngineCore pid=243) DEBUG 04-22 00:46:49 
[compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=243) DEBUG 04-22 00:46:50 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:46:50 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=243) DEBUG 04-22 00:46:50 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=243) DEBUG 04-22 00:46:50 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:46:50 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) INFO 04-22 00:46:52 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=243) DEBUG 04-22 00:46:52 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/3585710925/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=243) DEBUG 04-22 00:46:52 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:46:52 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(EngineCore pid=243) DEBUG 04-22 00:46:52 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.3 ms +(EngineCore pid=243) DEBUG 04-22 00:46:52 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:46:52 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) DEBUG 04-22 
00:46:53 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/3585710925/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=243) DEBUG 04-22 00:46:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:46:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=243) DEBUG 04-22 00:46:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=243) DEBUG 04-22 00:46:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:46:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(APIServer pid=1) DEBUG 04-22 00:46:54 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) DEBUG 04-22 00:46:55 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/3585710925/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_2') +(EngineCore pid=243) DEBUG 04-22 00:46:56 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:46:56 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(EngineCore pid=243) DEBUG 04-22 00:46:56 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms +(EngineCore pid=243) DEBUG 04-22 00:46:56 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:46:56 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 00:46:56 [compilation/backends.py:377] Store the 27-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_27', '/data/.cache/vllm/torch_compile_cache/3585710925/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_27') +(EngineCore pid=243) INFO 04-22 00:46:56 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 7.34 s +(EngineCore pid=243) DEBUG 04-22 00:46:56 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/3585710925/rank_0_0/backbone/computation_graph.py +(EngineCore pid=243) INFO 04-22 00:46:57 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/48b804c1578e9a38cf099387383e564c087328bc0200b1c8bc2bdfac409a174b/rank_0_0/model +(EngineCore pid=243) INFO 04-22 00:46:57 [compilation/monitor.py:48] torch.compile took 12.33 s in total +(EngineCore pid=243) 
/usr/local/lib/python3.12/dist-packages/torch/_inductor/lowering.py:7627: UserWarning: +(EngineCore pid=243) Online softmax is disabled on the fly since Inductor decides to +(EngineCore pid=243) split the reduction. Cut an issue to PyTorch if this is an +(EngineCore pid=243) important use case and you want to speed it up with online +(EngineCore pid=243) softmax. +(EngineCore pid=243) +(EngineCore pid=243) warnings.warn( +(EngineCore pid=243) WARNING 04-22 00:46:59 [model_executor/.../fused_moe/fused_moe.py:1090] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_H100_80GB_HBM3.json +(EngineCore pid=243) INFO 04-22 00:47:00 [compilation/monitor.py:76] Initial profiling/warmup run took 3.11 s +(APIServer pid=1) DEBUG 04-22 00:47:04 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=243) INFO 04-22 00:47:06 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=243) INFO 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:47:06 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None 
+(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:47:06 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 106.00 MiB first-capture + (51-1) × 10.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:47:06 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:47:06 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:47:07 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 522.00 MiB first-capture + (51-1) × 8.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 00:47:07 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=243) INFO 04-22 00:47:07 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.39 GiB total +(EngineCore pid=243) DEBUG 04-22 00:47:07 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB 
+(EngineCore pid=243) DEBUG 04-22 00:47:07 [v1/worker/gpu_worker.py:430] Free memory after profiling: 48.34 GiB (total), 44.89 GiB (within requested) +(EngineCore pid=243) DEBUG 04-22 00:47:07 [v1/worker/gpu_worker.py:435] Memory profiling takes 22.62 seconds. Total non KV cache memory: 31.62GiB; torch peak memory increase: 1.93GiB; non-torch forward increase memory: 0.26GiB; weights memory: 29.43GiB. +(EngineCore pid=243) INFO 04-22 00:47:07 [v1/worker/gpu_worker.py:436] Available KV cache memory: 43.61 GiB +(EngineCore pid=243) INFO 04-22 00:47:07 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9675 to maintain the same effective KV cache size. +(EngineCore pid=243) INFO 04-22 00:47:07 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,505,552 tokens +(EngineCore pid=243) INFO 04-22 00:47:07 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 183.78x +(EngineCore pid=243) 2026-04-22 00:47:07,885 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=243) DEBUG 04-22 00:47:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) 2026-04-22 00:47:07,915 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00= 1, got 40 +(APIServer pid=1) `rope_scaling`'s beta_fast field must be a float, got 32 +(APIServer pid=1) `rope_scaling`'s beta_slow field must be a float, got 1 +(APIServer pid=1) INFO 04-22 00:47:21 [renderers/hf.py:314] Detected the chat template content format to be 'string'. 
You can set `--chat-template-content-format` to override this. +(APIServer pid=1) DEBUG 04-22 00:47:21 [renderers/base.py:203] Chat template warmup completed in 0.695s +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:37] Available routes are: +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /load, Methods: GET +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /version, Methods: GET +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /health, Methods: GET +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: 
/v1/chat/completions/batch, Methods: POST +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST +(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST +(APIServer pid=1) INFO: Started server process [1] +(APIServer pid=1) INFO: Waiting for application startup. +(APIServer pid=1) INFO: Application startup complete. +(APIServer pid=1) DEBUG 04-22 00:47:22 [v1/engine/async_llm.py:875] Called check_health. 
+(APIServer pid=1) INFO: 10.129.6.2:33478 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/ibm-granite-granite-3-3---h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/ibm-granite-granite-3-3---h100-80gb--tp1pp1dp1--8192.log new file mode 100644 index 00000000..c6bf2f93 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/ibm-granite-granite-3-3---h100-80gb--tp1pp1dp1--8192.log @@ -0,0 +1,746 @@ +DEBUG 04-22 00:49:59 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:49:59 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:49:59 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:49:59 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:49:59 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:49:59 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:49:59 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:49:59 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:49:59 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:49:59 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:49:59 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:49:59 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:50:03 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 00:50:05 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 00:50:05 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:50:05 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:50:05 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:50:05 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 00:50:05 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:50:05 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:50:05 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:50:05 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model ibm-granite/granite-3.3-8b-instruct +(APIServer pid=1) INFO 04-22 00:50:05 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:50:05 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:50:05 [entrypoints/utils.py:233] non-default args: {'model_tag': 'ibm-granite/granite-3.3-8b-instruct', 'model': 'ibm-granite/granite-3.3-8b-instruct', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:50:05 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:50:06 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.granite.GraniteForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:50:06 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003631 secs +(APIServer pid=1) INFO 04-22 00:50:06 [config/model.py:549] Resolved architecture: GraniteForCausalLM +(APIServer pid=1) INFO 04-22 00:50:06 
[config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:50:06 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:50:06 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:50:06 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:50:06 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:50:06 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 00:50:06 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:50:06 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:50:06 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:50:06 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:50:06 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:50:06 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:50:10 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:50:10 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:50:10 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:50:10 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:50:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:50:10 [platforms/__init__.py:113] Checking if ROCm platform is available. 
+DEBUG 04-22 00:50:10 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:50:10 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:50:10 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:50:10 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:50:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:50:10 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:50:14 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=244) DEBUG 04-22 00:50:16 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:50:16 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=244) DEBUG 04-22 00:50:16 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/e19e2dca-0a9e-477a-b371-19c9091b3f73'], outputs=['ipc:///tmp/4e624f2d-09ca-410a-b00b-42134da1617e'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=244) DEBUG 04-22 00:50:16 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=244) DEBUG 04-22 00:50:16 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=244) DEBUG 04-22 00:50:16 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=244) DEBUG 04-22 00:50:16 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=244) DEBUG 04-22 00:50:16 [plugins/__init__.py:49] All plugins in this group will be 
loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(EngineCore pid=244) INFO 04-22 00:50:16 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='ibm-granite/granite-3.3-8b-instruct', speculative_config=None, tokenizer='ibm-granite/granite-3.3-8b-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=ibm-granite/granite-3.3-8b-instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 
'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=244) DEBUG 04-22 00:50:16 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.81:39775 backend=nccl +(EngineCore pid=244) INFO 04-22 00:50:16 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.81:39775 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) DEBUG 04-22 00:50:16 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) INFO 04-22 00:50:16 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=244) DEBUG 04-22 00:50:17 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776819017.1554534, auto_measure=True +(EngineCore pid=244) DEBUG 04-22 00:50:17 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=244) DEBUG 04-22 00:50:17 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:50:17 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:50:17 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:50:17 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=244) DEBUG 04-22 00:50:17 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=244) DEBUG 04-22 00:50:17 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=244) DEBUG 04-22 00:50:17 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=244) DEBUG 04-22 00:50:17 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) INFO 04-22 00:50:17 [v1/worker/gpu_model_runner.py:4735] Starting to load model ibm-granite/granite-3.3-8b-instruct... +(EngineCore pid=244) DEBUG 04-22 00:50:17 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=244) INFO 04-22 00:50:17 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=244) INFO 04-22 00:50:17 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=244) DEBUG 04-22 00:50:18 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=244) DEBUG 04-22 00:50:18 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=244) DEBUG 04-22 00:50:18 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 81, 'silu_and_mul': 40, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=244) DEBUG 04-22 00:50:18 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=244) DEBUG 04-22 00:50:18 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors']] +(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(APIServer pid=1) DEBUG 04-22 00:50:36 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/granite.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=244) INFO 04-22 00:50:37 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/ad49f5c00b/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=e817fe24ec comp=e546579c48 code=9615bc1b6a3cdadf99f40f12188b347f2842282ef159341e824808308c14a2aa dir=/data/.cache/vllm/torch_compile_cache/ad49f5c00b/rank_0_0/backbone +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 
04-22 00:50:37 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 
'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 
1024, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 
'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': 
-1.0, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, 
+(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 
[compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, 
+(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 
'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=244) DEBUG 
04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] Vllm config hash: e817fe24ec +(EngineCore pid=244) INFO 04-22 00:50:37 [compilation/backends.py:1111] Dynamo bytecode transform time: 5.01 s +(EngineCore pid=244) DEBUG 04-22 00:50:38 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=244) DEBUG 04-22 00:50:38 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=244) DEBUG 04-22 00:50:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:50:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=244) DEBUG 04-22 00:50:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=244) DEBUG 04-22 00:50:38 
[compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:50:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=244) INFO 04-22 00:50:40 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=244) DEBUG 04-22 00:50:40 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/ad49f5c00b/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=244) DEBUG 04-22 00:50:40 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:50:40 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=244) DEBUG 04-22 00:50:40 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=244) DEBUG 04-22 00:50:40 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:50:40 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=244) DEBUG 04-22 00:50:41 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/ad49f5c00b/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=244) DEBUG 04-22 00:50:42 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:50:42 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(EngineCore pid=244) DEBUG 04-22 00:50:42 [compilation/passes/vllm_inductor_pass.py:79] 
PostCleanupPass completed in 0.9 ms +(EngineCore pid=244) DEBUG 04-22 00:50:42 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:50:42 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=244) DEBUG 04-22 00:50:43 [compilation/backends.py:377] Store the 40-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_40', '/data/.cache/vllm/torch_compile_cache/ad49f5c00b/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_40') +(EngineCore pid=244) INFO 04-22 00:50:43 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.66 s +(EngineCore pid=244) DEBUG 04-22 00:50:43 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/ad49f5c00b/rank_0_0/backbone/computation_graph.py +(EngineCore pid=244) INFO 04-22 00:50:44 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/e0255b29bbc50ba3ddb543906d793d5561501aae395c54778798896484ebac2c/rank_0_0/model +(EngineCore pid=244) INFO 04-22 00:50:44 [compilation/monitor.py:48] torch.compile took 11.95 s in total +(EngineCore pid=244) INFO 04-22 00:50:45 [compilation/monitor.py:76] Initial profiling/warmup run took 0.47 s +(APIServer pid=1) DEBUG 04-22 00:50:46 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=244) INFO 04-22 00:50:53 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=244) DEBUG 04-22 00:50:53 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=244) INFO 04-22 00:50:53 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:50:54 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:50:54 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 124.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:50:54 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
+(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:50:54 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=244) INFO 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.74 GiB total +(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.7 GiB (total), 59.26 GiB (within requested) +(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_worker.py:435] Memory profiling takes 22.12 seconds. Total non KV cache memory: 16.57GiB; torch peak memory increase: 0.85GiB; non-torch forward increase memory: 0.46GiB; weights memory: 15.25GiB. +(EngineCore pid=244) INFO 04-22 00:50:54 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.66 GiB +(EngineCore pid=244) INFO 04-22 00:50:54 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9594 to maintain the same effective KV cache size. 
+(EngineCore pid=244) INFO 04-22 00:50:54 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 384,432 tokens +(EngineCore pid=244) INFO 04-22 00:50:54 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 46.93x +(EngineCore pid=244) 2026-04-22 00:50:54,971 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) 2026-04-22 00:50:54,983 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:51:21 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:51:21 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:51:21 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:51:21 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:51:21 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:51:21 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model ibm-granite/granite-vision-3.3-2b +(APIServer pid=1) INFO 04-22 00:51:21 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:51:21 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:51:21 [entrypoints/utils.py:233] non-default args: {'model_tag': 'ibm-granite/granite-vision-3.3-2b', 'model': 'ibm-granite/granite-vision-3.3-2b', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:51:21 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:51:21 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llava_next.LlavaNextForConditionalGeneration from cache +(APIServer pid=1) DEBUG 04-22 00:51:21 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0015486 secs +(APIServer pid=1) INFO 04-22 00:51:21 [config/model.py:549] Resolved architecture: LlavaNextForConditionalGeneration +(APIServer pid=1) INFO 04-22 00:51:21 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:51:21 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:51:21 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:51:21 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:51:21 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 00:51:21 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 00:51:21 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:51:21 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:51:21 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:51:21 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:51:22 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:51:22 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. +(APIServer pid=1) DEBUG 04-22 00:51:22 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:51:27 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:51:27 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:51:27 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:51:27 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:51:27 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:51:27 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:51:27 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:51:27 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:51:27 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:51:27 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:51:27 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
+DEBUG 04-22 00:51:27 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:51:32 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=467) DEBUG 04-22 00:51:34 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:51:34 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=467) DEBUG 04-22 00:51:34 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/832edc0f-9891-4845-ba97-32535407db1d'], outputs=['ipc:///tmp/be266a52-4723-4add-a4ab-66d2955016e6'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=467) DEBUG 04-22 00:51:34 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=467) DEBUG 04-22 00:51:34 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=467) DEBUG 04-22 00:51:34 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=467) DEBUG 04-22 00:51:34 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=467) DEBUG 04-22 00:51:34 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=467) INFO 04-22 00:51:34 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='ibm-granite/granite-vision-3.3-2b', speculative_config=None, tokenizer='ibm-granite/granite-vision-3.3-2b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=ibm-granite/granite-vision-3.3-2b, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=467) DEBUG 04-22 00:51:34 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(EngineCore pid=467) DEBUG 04-22 00:51:35 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.32:52891 backend=nccl +(EngineCore pid=467) INFO 04-22 00:51:35 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.32:52891 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(EngineCore pid=467) DEBUG 04-22 00:51:35 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=467) INFO 04-22 00:51:35 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=467) DEBUG 04-22 00:51:35 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776819095.7213402, auto_measure=True +(EngineCore pid=467) DEBUG 04-22 00:51:35 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=467) DEBUG 04-22 00:51:35 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=467) DEBUG 04-22 00:51:35 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=467) DEBUG 04-22 00:51:35 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=467) DEBUG 04-22 00:51:35 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=467) 
DEBUG 04-22 00:51:35 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(EngineCore pid=467) DEBUG 04-22 00:51:35 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=467) DEBUG 04-22 00:51:35 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. +(EngineCore pid=467) DEBUG 04-22 00:51:37 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=467) INFO 04-22 00:51:37 [v1/worker/gpu_model_runner.py:4735] Starting to load model ibm-granite/granite-vision-3.3-2b... +(EngineCore pid=467) INFO 04-22 00:51:37 [model_executor/models/interfaces.py:171] Contains out of vocabulary multimodal tokens? False +(EngineCore pid=467) INFO 04-22 00:51:37 [platforms/cuda.py:390] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention +(EngineCore pid=467) INFO 04-22 00:51:37 [model_executor/.../attention/mm_encoder_attention.py:230] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention. +(EngineCore pid=467) INFO 04-22 00:51:37 [config/vllm.py:790] Asynchronous scheduling is enabled. +(EngineCore pid=467) DEBUG 04-22 00:51:37 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=467) DEBUG 04-22 00:51:38 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=64, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. 
+(EngineCore pid=467) INFO 04-22 00:51:38 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=467) INFO 04-22 00:51:38 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=467) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=467) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=467) DEBUG 04-22 00:51:38 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=467) DEBUG 04-22 00:51:38 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) +(EngineCore pid=467) DEBUG 04-22 00:51:38 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 81, 'silu_and_mul': 40, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=467) DEBUG 04-22 00:51:38 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) +(EngineCore pid=467) DEBUG 04-22 00:51:38 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 81, 'silu_and_mul': 40, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=467) DEBUG 04-22 00:51:38 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(EngineCore pid=467) DEBUG 04-22 00:51:38 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors']] +(EngineCore pid=467) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/granite.py +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=467) INFO 04-22 00:51:49 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/56a7cc408f/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=107570cfa8 comp=e546579c48 code=bf166f325866070c057071cfb4009752565a89cce07da824263e5292c4847928 dir=/data/.cache/vllm/torch_compile_cache/56a7cc408f/rank_0_0/backbone +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 
'NVCC_THREADS': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=467) DEBUG 04-22 
00:51:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=467) DEBUG 04-22 00:51:49 
[compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=467) DEBUG 04-22 
00:51:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 
'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 
'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', 
+(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=467) DEBUG 04-22 00:51:49 
[compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, 
+(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] Vllm config hash: 107570cfa8 +(EngineCore pid=467) INFO 04-22 00:51:49 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.91 s +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=467) DEBUG 04-22 00:51:50 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=467) DEBUG 04-22 00:51:50 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=467) DEBUG 04-22 00:51:50 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms +(EngineCore pid=467) DEBUG 04-22 00:51:50 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes 
+(EngineCore pid=467) DEBUG 04-22 00:51:50 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=467) INFO 04-22 00:51:51 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=467) DEBUG 04-22 00:51:51 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/56a7cc408f/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=467) DEBUG 04-22 00:51:52 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=467) DEBUG 04-22 00:51:52 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=467) DEBUG 04-22 00:51:52 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms +(EngineCore pid=467) DEBUG 04-22 00:51:52 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=467) DEBUG 04-22 00:51:52 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=467) DEBUG 04-22 00:51:53 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/56a7cc408f/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(APIServer pid=1) DEBUG 04-22 00:51:54 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=467) DEBUG 04-22 00:51:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=467) DEBUG 04-22 00:51:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(EngineCore pid=467) DEBUG 04-22 00:51:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=467) DEBUG 04-22 00:51:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=467) DEBUG 04-22 00:51:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=467) DEBUG 04-22 00:51:54 [compilation/backends.py:377] Store the 40-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_40', '/data/.cache/vllm/torch_compile_cache/56a7cc408f/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_40') +(EngineCore pid=467) INFO 04-22 00:51:54 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.13 s +(EngineCore pid=467) DEBUG 04-22 00:51:54 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/56a7cc408f/rank_0_0/backbone/computation_graph.py +(EngineCore pid=467) INFO 04-22 00:51:56 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/6b5b3eda07b021ffbc2d42a757d2f8d9e2ac125e9658c69b9579ea1352bc8d9a/rank_0_0/model +(EngineCore pid=467) INFO 04-22 00:51:56 [compilation/monitor.py:48] torch.compile took 11.70 s in total +(EngineCore pid=467) INFO 04-22 00:51:56 [compilation/monitor.py:76] Initial profiling/warmup run took 0.10 s +(EngineCore pid=467) INFO 04-22 00:52:01 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=467) DEBUG 04-22 00:52:01 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for 
CUDA graph profiling +(EngineCore pid=467) INFO 04-22 00:52:01 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=467) DEBUG 04-22 00:52:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=467) DEBUG 04-22 00:52:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 100.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=467) DEBUG 04-22 00:52:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None 
+(EngineCore pid=467) DEBUG 04-22 00:52:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 134.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=467) INFO 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.62 GiB total +(EngineCore pid=467) DEBUG 04-22 00:52:03 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=467) DEBUG 04-22 00:52:03 [v1/worker/gpu_worker.py:430] Free memory after profiling: 72.63 GiB (total), 69.18 GiB (within requested) +(EngineCore pid=467) DEBUG 04-22 00:52:03 [v1/worker/gpu_worker.py:435] Memory profiling takes 18.74 seconds. Total non KV cache memory: 6.58GiB; torch peak memory increase: 0.79GiB; non-torch forward increase memory: 0.25GiB; weights memory: 5.54GiB. +(EngineCore pid=467) INFO 04-22 00:52:03 [v1/worker/gpu_worker.py:436] Available KV cache memory: 68.65 GiB +(EngineCore pid=467) INFO 04-22 00:52:03 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9578 to maintain the same effective KV cache size. 
+(EngineCore pid=467) INFO 04-22 00:52:03 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 899,792 tokens +(EngineCore pid=467) INFO 04-22 00:52:03 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 109.84x +(EngineCore pid=467) 2026-04-22 00:52:03,056 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=467) DEBUG 04-22 00:52:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=467) 2026-04-22 00:52:03,068 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=467) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:23:30 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:23:30 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:23:30 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:23:30 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:23:30 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:23:30 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct +(APIServer pid=1) INFO 04-22 00:23:30 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:23:30 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:23:30 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'dtype': 'bfloat16', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'kv_cache_dtype': 'fp8', 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:23:30 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:23:31 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:23:31 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003864 secs +(APIServer pid=1) INFO 04-22 00:23:31 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 00:23:31 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:23:31 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:23:31 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:23:31 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:23:31 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 00:23:31 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 00:23:31 [config/cache.py:227] Using fp8 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. Meanwhile, it may cause accuracy drop without a proper scaling factor. +(APIServer pid=1) INFO 04-22 00:23:31 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:23:31 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:23:31 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:23:31 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:23:31 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:23:31 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:23:35 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:23:35 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:23:35 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:23:35 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:23:35 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:23:35 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:23:35 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:23:35 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:23:35 [platforms/__init__.py:165] Checking if CPU platform is available. 
+DEBUG 04-22 00:23:35 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:23:35 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:23:35 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:23:40 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=241) DEBUG 04-22 00:23:41 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:23:41 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=241) DEBUG 04-22 00:23:41 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/ee17de87-bccf-49f5-a945-1c7fdda0e114'], outputs=['ipc:///tmp/19d96faf-9b91-482b-9350-d3006f87dc8b'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=241) DEBUG 04-22 00:23:41 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=241) DEBUG 04-22 00:23:41 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=241) DEBUG 04-22 00:23:41 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=241) DEBUG 04-22 00:23:41 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=241) DEBUG 04-22 00:23:41 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=241) INFO 04-22 00:23:41 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=fp8, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=241) DEBUG 04-22 00:23:42 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.193:53205 backend=nccl +(EngineCore pid=241) INFO 04-22 00:23:42 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.193:53205 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=241) DEBUG 04-22 00:23:42 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=241) INFO 04-22 00:23:42 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=241) DEBUG 04-22 00:23:42 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776817422.4285054, auto_measure=True +(EngineCore pid=241) DEBUG 04-22 00:23:42 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=241) DEBUG 04-22 00:23:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=241) DEBUG 04-22 00:23:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=241) DEBUG 04-22 00:23:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=241) DEBUG 04-22 00:23:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=241) DEBUG 04-22 00:23:42 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=241) DEBUG 04-22 00:23:42 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=241) DEBUG 04-22 00:23:42 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=241) INFO 04-22 00:23:42 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... +(EngineCore pid=241) DEBUG 04-22 00:23:43 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=fp8, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLEX_ATTENTION: [kv_cache_dtype not supported]}. +(EngineCore pid=241) INFO 04-22 00:23:43 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN']. +(EngineCore pid=241) INFO 04-22 00:23:43 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=241) INFO 04-22 00:23:43 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. +(EngineCore pid=241) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=241) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=241) DEBUG 04-22 00:23:43 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=241) DEBUG 04-22 00:23:43 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=241) DEBUG 04-22 00:23:43 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'quant_fp8': 32, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=241) DEBUG 04-22 00:23:43 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=241) DEBUG 04-22 00:23:43 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00002-of-00004.safetensors']] +(EngineCore pid=241) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(APIServer pid=1) DEBUG 04-22 00:23:51 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=241) INFO 04-22 00:23:54 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/16bd0cf229/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=a3a40b90b5 comp=e546579c48 code=a481fe428ff1d132f00a2eb629698322104288cdeebd6384595457f75f95534e dir=/data/.cache/vllm/torch_compile_cache/16bd0cf229/rank_0_0/backbone +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] Compile env factors 
(raw): +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, 
+(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore 
pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=241) DEBUG 04-22 
00:23:54 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 
[compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=241) DEBUG 04-22 00:23:54 
[compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 
[compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=241) DEBUG 04-22 
00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, 
+(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore 
pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 
[compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] Vllm config hash: a3a40b90b5 +(EngineCore pid=241) INFO 04-22 00:23:54 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.56 s +(EngineCore pid=241) DEBUG 04-22 00:23:55 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=241) DEBUG 04-22 00:23:55 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=241) DEBUG 04-22 00:23:55 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices 
+(EngineCore pid=241) DEBUG 04-22 00:23:55 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=241) DEBUG 04-22 00:23:55 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=241) DEBUG 04-22 00:23:55 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=241) DEBUG 04-22 00:23:55 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=241) INFO 04-22 00:23:57 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=241) DEBUG 04-22 00:23:57 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/16bd0cf229/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=241) DEBUG 04-22 00:23:57 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=241) DEBUG 04-22 00:23:57 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=241) DEBUG 04-22 00:23:57 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.2 ms +(EngineCore pid=241) DEBUG 04-22 00:23:57 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=241) DEBUG 04-22 00:23:57 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=241) DEBUG 04-22 00:23:59 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/16bd0cf229/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=241) DEBUG 04-22 00:24:00 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=241) DEBUG 04-22 00:24:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(EngineCore pid=241) DEBUG 04-22 00:24:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=241) DEBUG 04-22 00:24:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=241) DEBUG 04-22 00:24:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=241) DEBUG 04-22 00:24:00 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/16bd0cf229/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') +(EngineCore pid=241) INFO 04-22 00:24:00 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.35 s +(EngineCore pid=241) DEBUG 04-22 00:24:00 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/16bd0cf229/rank_0_0/backbone/computation_graph.py +(APIServer pid=1) DEBUG 04-22 00:24:01 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=241) INFO 04-22 00:24:01 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/fadd5d0dcb1cecef36f6290c55d85692abb624cc2fa3646ab99c523f39738c6c/rank_0_0/model +(EngineCore pid=241) INFO 04-22 00:24:01 [compilation/monitor.py:48] torch.compile took 11.42 s in total +(EngineCore pid=241) INFO 04-22 00:24:02 [compilation/monitor.py:76] Initial profiling/warmup run took 0.45 s +(EngineCore pid=241) INFO 04-22 00:24:07 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=241) INFO 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) DEBUG 04-22 00:24:07 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) DEBUG 04-22 00:24:07 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 124.00 MiB first-capture + (51-1) × 6.00 MiB per-graph 
+(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) DEBUG 04-22 00:24:07 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) DEBUG 04-22 00:24:07 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 260.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=241) DEBUG 04-22 00:24:08 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=241) INFO 04-22 00:24:08 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total +(EngineCore pid=241) DEBUG 04-22 00:24:08 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=241) DEBUG 04-22 00:24:08 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.24 GiB (total), 58.79 GiB (within requested) +(EngineCore pid=241) DEBUG 04-22 00:24:08 [v1/worker/gpu_worker.py:435] Memory profiling takes 18.33 seconds. Total non KV cache memory: 17.12GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 14.99GiB. 
+(EngineCore pid=241) INFO 04-22 00:24:08 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.11 GiB +(EngineCore pid=241) INFO 04-22 00:24:08 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. +(EngineCore pid=241) INFO 04-22 00:24:08 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 952,032 tokens +(EngineCore pid=241) INFO 04-22 00:24:08 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 116.21x +(EngineCore pid=241) 2026-04-22 00:24:08,713 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=241) DEBUG 04-22 00:24:08 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) 2026-04-22 00:24:08,721 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=241) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:06:59 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:06:59 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:06:59 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:06:59 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:06:59 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:06:59 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct +(APIServer pid=1) INFO 04-22 00:06:59 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:06:59 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:06:59 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'dtype': 'bfloat16', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:06:59 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:07:00 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:07:00 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003826 secs +(APIServer pid=1) INFO 04-22 00:07:00 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 00:07:00 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:07:00 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:07:00 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:07:00 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:07:00 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 00:07:00 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 00:07:00 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:07:00 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:07:00 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:07:00 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:07:00 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:07:00 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:07:04 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:07:04 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:07:04 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:07:04 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:07:04 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:07:04 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:07:04 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:07:04 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:07:04 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:07:04 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:07:04 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:07:04 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-22 00:07:08 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=244) DEBUG 04-22 00:07:10 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:07:10 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=244) DEBUG 04-22 00:07:10 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/1660a71c-bad1-4fd1-97d6-d5652f5ff393'], outputs=['ipc:///tmp/c79bb2ea-2ffb-494e-95e5-68061e6538d9'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=244) DEBUG 04-22 00:07:10 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=244) DEBUG 04-22 00:07:10 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=244) DEBUG 04-22 00:07:10 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=244) DEBUG 04-22 00:07:10 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=244) DEBUG 04-22 00:07:10 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=244) INFO 04-22 00:07:10 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=244) DEBUG 04-22 00:07:10 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.187:47257 backend=nccl +(EngineCore pid=244) INFO 04-22 00:07:10 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.187:47257 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) DEBUG 04-22 00:07:10 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) INFO 04-22 00:07:10 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=244) DEBUG 04-22 00:07:11 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776816431.302875, auto_measure=True +(EngineCore pid=244) DEBUG 04-22 00:07:11 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=244) DEBUG 04-22 00:07:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:07:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:07:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:07:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=244) DEBUG 04-22 00:07:11 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=244) DEBUG 04-22 00:07:11 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=244) DEBUG 04-22 00:07:11 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=244) INFO 04-22 00:07:11 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... +(EngineCore pid=244) DEBUG 04-22 00:07:12 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=244) INFO 04-22 00:07:12 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=244) INFO 04-22 00:07:12 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=244) DEBUG 04-22 00:07:12 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=244) DEBUG 04-22 00:07:12 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=244) DEBUG 04-22 00:07:12 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=244) DEBUG 04-22 00:07:12 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=244) DEBUG 04-22 00:07:12 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors']] +(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:05:59 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:05:59 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:05:59 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:05:59 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:05:59 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:05:59 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct +(APIServer pid=1) INFO 04-22 00:05:59 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:05:59 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:05:59 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'dtype': 'float16', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:05:59 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:05:59 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:05:59 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003684 secs +(APIServer pid=1) INFO 04-22 00:05:59 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) WARNING 04-22 00:05:59 [config/model.py:2016] Casting torch.bfloat16 to torch.float16. +(APIServer pid=1) INFO 04-22 00:05:59 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:05:59 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:05:59 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:05:59 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:05:59 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 00:05:59 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 00:05:59 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:05:59 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:05:59 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:05:59 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:06:00 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:06:00 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:06:03 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:06:03 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:06:03 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:06:03 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:06:03 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:06:03 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:06:03 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:06:03 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:06:03 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:06:03 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:06:03 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:06:03 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-22 00:06:08 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=243) DEBUG 04-22 00:06:10 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:06:10 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=243) DEBUG 04-22 00:06:10 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/fca1389a-59c4-4e9b-a75c-f1faad6c104b'], outputs=['ipc:///tmp/c00a72ab-c547-4d85-92e6-2c8582029bd6'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 00:06:10 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 00:06:10 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 00:06:10 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 00:06:10 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 00:06:10 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=243) INFO 04-22 00:06:10 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) DEBUG 04-22 00:06:10 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.186:55885 backend=nccl +(EngineCore pid=243) INFO 04-22 00:06:10 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.186:55885 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) DEBUG 04-22 00:06:10 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) INFO 04-22 00:06:10 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=243) DEBUG 04-22 00:06:11 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776816371.008869, auto_measure=True +(EngineCore pid=243) DEBUG 04-22 00:06:11 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=243) DEBUG 04-22 00:06:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:06:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:06:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:06:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=243) DEBUG 04-22 00:06:11 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=243) DEBUG 04-22 00:06:11 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=243) DEBUG 04-22 00:06:11 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=243) INFO 04-22 00:06:11 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... +(EngineCore pid=243) DEBUG 04-22 00:06:11 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.float16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=243) INFO 04-22 00:06:11 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=243) INFO 04-22 00:06:11 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=243) DEBUG 04-22 00:06:11 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=243) DEBUG 04-22 00:06:11 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=243) DEBUG 04-22 00:06:11 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=243) DEBUG 04-22 00:06:11 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=243) DEBUG 04-22 00:06:12 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors']] +(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(APIServer pid=1) DEBUG 04-22 00:06:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=243) INFO 04-22 00:06:20 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/739e065bf0/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=fd6c2a97a9 comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/739e065bf0/rank_0_0/backbone +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 00:06:20 
[compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
+(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=243) DEBUG 04-22 
00:06:20 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=243) 
DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=243) DEBUG 04-22 00:06:20 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=243) DEBUG 04-22 00:06:20 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=243) DEBUG 04-22 
00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore 
pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore 
pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] Vllm config hash: fd6c2a97a9 +(EngineCore pid=243) INFO 04-22 00:06:20 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.22 s +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=243) DEBUG 04-22 00:06:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:06:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 00:06:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=243) DEBUG 04-22 00:06:21 [compilation/.../utility/fix_functionalization.py:203] 
De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:06:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) INFO 04-22 00:06:23 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=243) DEBUG 04-22 00:06:23 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/739e065bf0/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=243) DEBUG 04-22 00:06:23 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:06:23 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=243) DEBUG 04-22 00:06:23 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.2 ms +(EngineCore pid=243) DEBUG 04-22 00:06:23 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:06:23 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) DEBUG 04-22 00:06:24 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/739e065bf0/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=243) DEBUG 04-22 00:06:25 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:06:25 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(EngineCore pid=243) DEBUG 04-22 00:06:25 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=243) 
DEBUG 04-22 00:06:25 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:06:25 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 00:06:26 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/739e065bf0/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') +(EngineCore pid=243) INFO 04-22 00:06:26 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.04 s +(EngineCore pid=243) DEBUG 04-22 00:06:26 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/739e065bf0/rank_0_0/backbone/computation_graph.py +(EngineCore pid=243) INFO 04-22 00:06:27 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/8a96897ca6e1860a8e6b9000115979b0b0ec581428214eb12ff34604a061c971/rank_0_0/model +(EngineCore pid=243) INFO 04-22 00:06:27 [compilation/monitor.py:48] torch.compile took 10.68 s in total +(EngineCore pid=243) INFO 04-22 00:06:27 [compilation/monitor.py:76] Initial profiling/warmup run took 0.48 s +(APIServer pid=1) DEBUG 04-22 00:06:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) INFO 04-22 00:06:33 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=243) INFO 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:06:33 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:06:33 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 138.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:06:33 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
+(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:06:33 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 260.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 00:06:34 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=243) INFO 04-22 00:06:34 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total +(EngineCore pid=243) DEBUG 04-22 00:06:34 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=243) DEBUG 04-22 00:06:34 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.24 GiB (total), 58.79 GiB (within requested) +(EngineCore pid=243) DEBUG 04-22 00:06:34 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.70 seconds. Total non KV cache memory: 17.12GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 14.99GiB. +(EngineCore pid=243) INFO 04-22 00:06:34 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.11 GiB +(EngineCore pid=243) INFO 04-22 00:06:34 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. 
+(EngineCore pid=243) INFO 04-22 00:06:34 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 476,016 tokens +(EngineCore pid=243) INFO 04-22 00:06:34 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 58.11x +(EngineCore pid=243) 2026-04-22 00:06:34,298 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=243) DEBUG 04-22 00:06:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) 2026-04-22 00:06:34,307 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:44:53 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:44:53 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:44:53 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:44:53 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:44:53 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:44:53 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct +(APIServer pid=1) INFO 04-22 01:44:53 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:44:53 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:44:53 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 2048, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:44:53 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:44:54 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:44:54 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0025746 secs +(APIServer pid=1) INFO 04-22 01:44:54 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 01:44:54 [config/model.py:1678] Using max model len 2048 +(APIServer pid=1) DEBUG 04-22 01:44:54 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:44:54 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:44:54 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:44:54 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:44:54 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) INFO 04-22 01:44:54 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:44:54 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 01:44:54 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:44:54 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:44:54 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:44:54 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:44:58 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:44:58 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:44:58 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:44:58 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:44:58 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:44:58 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:44:58 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:44:58 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:44:58 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:44:58 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:44:58 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:44:58 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:45:03 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+(EngineCore pid=243) DEBUG 04-22 01:45:04 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:45:04 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=243) DEBUG 04-22 01:45:04 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/7440d862-18f2-4f83-93ee-efde0faecd74'], outputs=['ipc:///tmp/f5b0c178-cd3a-4ae6-875f-cf85ae54d43b'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 01:45:04 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 01:45:04 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 01:45:04 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 01:45:04 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 01:45:04 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=243) INFO 04-22 01:45:04 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) DEBUG 04-22 01:45:05 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.35:48997 backend=nccl +(EngineCore pid=243) INFO 04-22 01:45:05 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.35:48997 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) DEBUG 04-22 01:45:05 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) INFO 04-22 01:45:05 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=243) DEBUG 04-22 01:45:05 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822305.5706387, auto_measure=True +(EngineCore pid=243) DEBUG 04-22 01:45:05 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=243) DEBUG 04-22 01:45:05 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:45:05 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:45:05 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:45:05 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=243) DEBUG 04-22 01:45:05 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=243) DEBUG 04-22 01:45:05 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=243) DEBUG 04-22 01:45:05 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=243) INFO 04-22 01:45:05 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... +(EngineCore pid=243) DEBUG 04-22 01:45:06 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=243) INFO 04-22 01:45:06 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=243) INFO 04-22 01:45:06 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=243) DEBUG 04-22 01:45:06 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=243) DEBUG 04-22 01:45:06 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=243) DEBUG 04-22 01:45:06 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=243) DEBUG 04-22 01:45:06 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=243) DEBUG 04-22 01:45:06 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00002-of-00004.safetensors']] +(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=243) DEBUG 04-22 
01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=243) INFO 
04-22 01:45:23 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/42b409dd87/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=0252433631 comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/42b409dd87/rank_0_0/backbone +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, 
+(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 
[compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 
[compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, 
+(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, 
+(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore 
pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=243) 
DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
+(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] Vllm config hash: 0252433631 +(EngineCore pid=243) INFO 04-22 01:45:23 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.31 s +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=243) DEBUG 04-22 01:45:24 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:45:24 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=243) DEBUG 04-22 01:45:24 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=243) DEBUG 04-22 01:45:24 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:45:24 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(APIServer pid=1) DEBUG 04-22 01:45:24 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) INFO 04-22 01:45:26 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=243) DEBUG 04-22 01:45:26 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/42b409dd87/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=243) DEBUG 04-22 01:45:26 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:45:26 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=243) DEBUG 04-22 01:45:26 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=243) DEBUG 04-22 01:45:26 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:45:26 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) DEBUG 04-22 01:45:27 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/42b409dd87/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=243) DEBUG 04-22 01:45:28 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:45:28 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(EngineCore pid=243) DEBUG 04-22 01:45:28 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=243) DEBUG 04-22 01:45:28 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:45:28 
[compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 01:45:28 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/42b409dd87/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') +(EngineCore pid=243) INFO 04-22 01:45:28 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.21 s +(EngineCore pid=243) DEBUG 04-22 01:45:29 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/42b409dd87/rank_0_0/backbone/computation_graph.py +(EngineCore pid=243) INFO 04-22 01:45:30 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/6a79de39180cf098b701309e28d6e3149e650bda89dc5a00214c5ae0d896a5de/rank_0_0/model +(EngineCore pid=243) INFO 04-22 01:45:30 [compilation/monitor.py:48] torch.compile took 10.96 s in total +(EngineCore pid=243) INFO 04-22 01:45:30 [compilation/monitor.py:76] Initial profiling/warmup run took 0.45 s +(APIServer pid=1) DEBUG 04-22 01:45:34 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) INFO 04-22 01:45:36 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=243) INFO 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:45:36 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:45:36 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 138.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:45:36 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
+(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:45:36 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 260.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=243) INFO 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total +(EngineCore pid=243) DEBUG 04-22 01:45:37 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=243) DEBUG 04-22 01:45:37 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.26 GiB (total), 58.81 GiB (within requested) +(EngineCore pid=243) DEBUG 04-22 01:45:37 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.92 seconds. Total non KV cache memory: 17.12GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 14.99GiB. +(EngineCore pid=243) INFO 04-22 01:45:37 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.11 GiB +(EngineCore pid=243) INFO 04-22 01:45:37 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. 
+(EngineCore pid=243) INFO 04-22 01:45:37 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 476,016 tokens +(EngineCore pid=243) INFO 04-22 01:45:37 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 2,048 tokens per request: 232.43x +(EngineCore pid=243) 2026-04-22 01:45:37,202 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=243) DEBUG 04-22 01:45:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) 2026-04-22 01:45:37,212 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:46:04 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:46:04 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:46:04 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:46:04 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:46:04 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:46:04 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct +(APIServer pid=1) INFO 04-22 01:46:04 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:46:04 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:46:04 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 4096, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:46:04 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:46:04 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:46:04 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003769 secs +(APIServer pid=1) INFO 04-22 01:46:04 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 01:46:04 [config/model.py:1678] Using max model len 4096 +(APIServer pid=1) DEBUG 04-22 01:46:04 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:46:04 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:46:04 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:46:04 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:46:04 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) INFO 04-22 01:46:04 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:46:04 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 01:46:05 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:46:05 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:46:05 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:46:05 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:46:09 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:46:09 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:46:09 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:46:09 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:46:09 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:46:09 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:46:09 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:46:09 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:46:09 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:46:09 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:46:09 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:46:09 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:46:13 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+(EngineCore pid=243) DEBUG 04-22 01:46:15 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:46:15 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=243) DEBUG 04-22 01:46:15 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/779c182c-4761-4a4d-9b5c-a83785404ad2'], outputs=['ipc:///tmp/afe7f37a-1eb4-4819-a4e6-d191162b838c'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 01:46:15 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 01:46:15 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 01:46:15 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 01:46:15 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 01:46:15 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=243) INFO 04-22 01:46:15 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) DEBUG 04-22 01:46:15 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.36:39353 backend=nccl +(EngineCore pid=243) INFO 04-22 01:46:15 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.36:39353 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) DEBUG 04-22 01:46:15 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) INFO 04-22 01:46:15 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=243) DEBUG 04-22 01:46:16 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822376.346393, auto_measure=True +(EngineCore pid=243) DEBUG 04-22 01:46:16 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=243) DEBUG 04-22 01:46:16 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:46:16 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:46:16 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:46:16 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=243) DEBUG 04-22 01:46:16 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=243) DEBUG 04-22 01:46:16 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=243) DEBUG 04-22 01:46:16 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=243) INFO 04-22 01:46:16 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... +(EngineCore pid=243) DEBUG 04-22 01:46:17 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=243) INFO 04-22 01:46:17 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=243) INFO 04-22 01:46:17 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=243) DEBUG 04-22 01:46:17 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=243) DEBUG 04-22 01:46:17 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=243) DEBUG 04-22 01:46:17 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=243) DEBUG 04-22 01:46:17 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=243) DEBUG 04-22 01:46:17 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00004-of-00004.safetensors']] +(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(APIServer pid=1) DEBUG 04-22 01:46:25 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=243) INFO 04-22 01:46:29 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/a3ffc11007/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=1b824e05d5 comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/a3ffc11007/rank_0_0/backbone +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 01:46:29 
[compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
+(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=243) DEBUG 04-22 
01:46:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=243) 
DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=243) DEBUG 04-22 01:46:29 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=243) DEBUG 04-22 01:46:29 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=243) DEBUG 04-22 
01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore 
pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore 
pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] Vllm config hash: 1b824e05d5 +(EngineCore pid=243) INFO 04-22 01:46:29 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.35 s +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/.../utility/fix_functionalization.py:203] 
De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) INFO 04-22 01:46:31 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=243) DEBUG 04-22 01:46:31 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/a3ffc11007/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=243) DEBUG 04-22 01:46:31 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:46:31 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=243) DEBUG 04-22 01:46:31 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=243) DEBUG 04-22 01:46:31 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:46:31 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) DEBUG 04-22 01:46:33 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/a3ffc11007/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=243) DEBUG 04-22 01:46:34 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:46:34 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(EngineCore pid=243) DEBUG 04-22 01:46:34 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=243) 
DEBUG 04-22 01:46:34 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:46:34 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 01:46:34 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/a3ffc11007/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') +(EngineCore pid=243) INFO 04-22 01:46:34 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.19 s +(EngineCore pid=243) DEBUG 04-22 01:46:34 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/a3ffc11007/rank_0_0/backbone/computation_graph.py +(APIServer pid=1) DEBUG 04-22 01:46:35 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) INFO 04-22 01:46:35 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/2dde39181b04ca928d91381047e0719bd31f31ec1f42285a0f371726d4389ad4/rank_0_0/model +(EngineCore pid=243) INFO 04-22 01:46:35 [compilation/monitor.py:48] torch.compile took 10.97 s in total +(EngineCore pid=243) INFO 04-22 01:46:36 [compilation/monitor.py:76] Initial profiling/warmup run took 0.45 s +(EngineCore pid=243) INFO 04-22 01:46:41 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=243) INFO 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:46:41 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:46:41 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 138.00 MiB first-capture + (51-1) × 6.00 MiB per-graph 
+(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:46:41 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:46:41 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 260.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 01:46:42 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=243) INFO 04-22 01:46:42 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total +(EngineCore pid=243) DEBUG 04-22 01:46:42 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=243) DEBUG 04-22 01:46:42 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.26 GiB (total), 58.81 GiB (within requested) +(EngineCore pid=243) DEBUG 04-22 01:46:42 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.89 seconds. Total non KV cache memory: 17.12GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 14.99GiB. 
+(EngineCore pid=243) INFO 04-22 01:46:42 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.11 GiB +(EngineCore pid=243) INFO 04-22 01:46:42 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. +(EngineCore pid=243) INFO 04-22 01:46:42 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 476,016 tokens +(EngineCore pid=243) INFO 04-22 01:46:42 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 4,096 tokens per request: 116.21x +(EngineCore pid=243) 2026-04-22 01:46:42,654 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=243) DEBUG 04-22 01:46:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) 2026-04-22 01:46:42,663 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:03:35 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:03:35 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:03:35 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:03:35 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:03:35 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:03:35 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct +(APIServer pid=1) INFO 04-22 00:03:35 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:03:35 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:03:35 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:03:35 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:03:36 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:03:36 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0012799 secs +(APIServer pid=1) INFO 04-22 00:03:36 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 00:03:36 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:03:36 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:03:36 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:03:36 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:03:36 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:03:36 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) INFO 04-22 00:03:36 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:03:36 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:03:36 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:03:36 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:03:36 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:03:36 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:03:40 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:03:40 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:03:40 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:03:40 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:03:40 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:03:40 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:03:40 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:03:40 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:03:40 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:03:40 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:03:40 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:03:40 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:03:45 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+(EngineCore pid=242) DEBUG 04-22 00:03:46 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:03:46 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=242) DEBUG 04-22 00:03:46 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/a6ae8f34-d228-453e-891a-3092bd7b0a4e'], outputs=['ipc:///tmp/af9ad280-429e-48b4-bc4c-a51022de6971'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=242) DEBUG 04-22 00:03:46 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=242) DEBUG 04-22 00:03:46 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=242) DEBUG 04-22 00:03:46 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=242) DEBUG 04-22 00:03:46 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=242) DEBUG 04-22 00:03:46 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=242) INFO 04-22 00:03:46 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=242) DEBUG 04-22 00:03:47 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.185:60191 backend=nccl +(EngineCore pid=242) INFO 04-22 00:03:47 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.185:60191 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=242) DEBUG 04-22 00:03:47 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=242) INFO 04-22 00:03:47 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=242) DEBUG 04-22 00:03:47 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776816227.5468106, auto_measure=True +(EngineCore pid=242) DEBUG 04-22 00:03:47 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=242) DEBUG 04-22 00:03:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=242) DEBUG 04-22 00:03:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=242) DEBUG 04-22 00:03:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=242) DEBUG 04-22 00:03:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=242) DEBUG 04-22 00:03:47 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=242) DEBUG 04-22 00:03:47 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=242) DEBUG 04-22 00:03:47 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=242) INFO 04-22 00:03:47 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... +(EngineCore pid=242) DEBUG 04-22 00:03:48 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=242) INFO 04-22 00:03:48 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=242) INFO 04-22 00:03:48 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=242) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=242) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=242) DEBUG 04-22 00:03:48 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=242) DEBUG 04-22 00:03:48 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=242) DEBUG 04-22 00:03:48 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=242) DEBUG 04-22 00:03:48 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=242) DEBUG 04-22 00:03:48 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00004-of-00004.safetensors']] +(EngineCore pid=242) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(APIServer pid=1) DEBUG 04-22 00:04:06 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=242) INFO 04-22 00:04:07 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/b9570f3049/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=5e2afef975 comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/b9570f3049/rank_0_0/backbone +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=242) DEBUG 04-22 00:04:07 
[compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
+(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=242) DEBUG 04-22 
00:04:07 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=242) 
DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=242) DEBUG 04-22 00:04:07 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=242) DEBUG 04-22 00:04:07 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=242) DEBUG 04-22 
00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore 
pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore 
pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] Vllm config hash: 5e2afef975 +(EngineCore pid=242) INFO 04-22 00:04:07 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.29 s +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=242) DEBUG 04-22 00:04:08 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=242) DEBUG 04-22 00:04:08 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=242) DEBUG 04-22 00:04:08 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.3 ms +(EngineCore pid=242) DEBUG 04-22 00:04:08 [compilation/.../utility/fix_functionalization.py:203] 
De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=242) DEBUG 04-22 00:04:08 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=242) INFO 04-22 00:04:09 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=242) DEBUG 04-22 00:04:09 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/b9570f3049/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=242) DEBUG 04-22 00:04:10 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=242) DEBUG 04-22 00:04:10 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=242) DEBUG 04-22 00:04:10 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=242) DEBUG 04-22 00:04:10 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=242) DEBUG 04-22 00:04:10 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=242) DEBUG 04-22 00:04:11 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/b9570f3049/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=242) DEBUG 04-22 00:04:12 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=242) DEBUG 04-22 00:04:12 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(EngineCore pid=242) DEBUG 04-22 00:04:12 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=242) 
DEBUG 04-22 00:04:12 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=242) DEBUG 04-22 00:04:12 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=242) DEBUG 04-22 00:04:12 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/b9570f3049/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') +(EngineCore pid=242) INFO 04-22 00:04:12 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.13 s +(EngineCore pid=242) DEBUG 04-22 00:04:12 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/b9570f3049/rank_0_0/backbone/computation_graph.py +(EngineCore pid=242) INFO 04-22 00:04:13 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/346f76857706c0537d5aef76aa2af17185a22023d838afba9a1d18619c755486/rank_0_0/model +(EngineCore pid=242) INFO 04-22 00:04:13 [compilation/monitor.py:48] torch.compile took 10.84 s in total +(EngineCore pid=242) INFO 04-22 00:04:14 [compilation/monitor.py:76] Initial profiling/warmup run took 0.46 s +(APIServer pid=1) DEBUG 04-22 00:04:16 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=242) INFO 04-22 00:04:19 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=242) DEBUG 04-22 00:04:19 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=242) INFO 04-22 00:04:19 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 00:04:20 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 00:04:20 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 138.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 00:04:20 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
+(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 00:04:20 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 260.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=242) INFO 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total +(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.24 GiB (total), 58.79 GiB (within requested) +(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.96 seconds. Total non KV cache memory: 17.12GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 14.99GiB. +(EngineCore pid=242) INFO 04-22 00:04:20 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.11 GiB +(EngineCore pid=242) INFO 04-22 00:04:20 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. 
+(EngineCore pid=242) INFO 04-22 00:04:20 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 476,016 tokens +(EngineCore pid=242) INFO 04-22 00:04:20 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 58.11x +(EngineCore pid=242) 2026-04-22 00:04:20,974 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) 2026-04-22 00:04:20,983 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=242) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:41:28 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:41:28 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +INFO 04-22 01:41:28 [entrypoints/cli/serve.py:101] Defaulting api_server_count to data_parallel_size (2). 
+DEBUG 04-22 01:41:28 [v1/metrics/prometheus.py:27] Created PROMETHEUS_MULTIPROC_DIR at /tmp/tmphdhy4way +INFO 04-22 01:41:28 [entrypoints/utils.py:299] +INFO 04-22 01:41:28 [entrypoints/utils.py:299] █ █ █▄ ▄█ +INFO 04-22 01:41:28 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +INFO 04-22 01:41:28 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct +INFO 04-22 01:41:28 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +INFO 04-22 01:41:28 [entrypoints/utils.py:299] +INFO 04-22 01:41:28 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'api_server_count': 2, 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 8192, 'data_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +WARNING 04-22 01:41:28 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +DEBUG 04-22 01:41:28 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +DEBUG 04-22 01:41:28 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0013756 secs +INFO 04-22 01:41:28 [config/model.py:549] Resolved architecture: LlamaForCausalLM +INFO 04-22 01:41:28 [config/model.py:1678] Using max model len 8192 +DEBUG 04-22 01:41:28 [config/model.py:1743] Generative models support chunked prefill. +DEBUG 04-22 01:41:28 [config/model.py:1801] Generative models support prefix caching. +DEBUG 04-22 01:41:28 [engine/arg_utils.py:2116] Enabling chunked prefill by default +DEBUG 04-22 01:41:28 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +DEBUG 04-22 01:41:28 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+DEBUG 04-22 01:41:28 [config/parallel.py:743] Defaulting to use mp for distributed inference +INFO 04-22 01:41:28 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +INFO 04-22 01:41:28 [config/vllm.py:790] Asynchronous scheduling is enabled. +DEBUG 04-22 01:41:32 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:41:32 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:41:32 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:41:32 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:41:32 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:41:32 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:41:32 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:41:32 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:41:32 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:41:32 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:41:32 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:41:32 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:41:37 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +INFO 04-22 01:41:38 [v1/engine/utils.py:914] Started DP Coordinator process (PID: 239) +INFO 04-22 01:41:38 [v1/utils.py:223] Started 2 API server processes +DEBUG 04-22 01:41:41 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:41:41 [platforms/__init__.py:37] Checking if TPU platform is available. 
+DEBUG 04-22 01:41:41 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:41:41 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:41:41 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:41:41 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:41:41 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:41:41 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:41:41 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:41:41 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:41:41 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:41:41 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:41:41 [platforms/__init__.py:37] Checking if TPU platform is available. 
+DEBUG 04-22 01:41:41 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:41:41 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:41:41 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:41:41 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:41:41 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:41:41 [platforms/__init__.py:113] Checking if ROCm platform is available. 
+DEBUG 04-22 01:41:41 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:41:41 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:41:41 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:41:46 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:41:46 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:41:46 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:41:46 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [v1/engine/core.py:1018] Waiting for init message from front-end. +DEBUG 04-22 01:41:48 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/79346ad6-6e10-42b0-95ff-725d8af58318', 'ipc:///tmp/76f7bc9d-9737-4b5a-849d-e58f7f34ac76'], outputs=['ipc:///tmp/739e9588-9318-470d-8b78-7c2a18de5ffa', 'ipc:///tmp/5b6feb5c-2bf1-4aa0-be99-632876126031'], coordinator_input='ipc:///tmp/45db5870-0521-4512-b1ea-dbb2fb7083e8', coordinator_output='ipc:///tmp/4b686f6c-d679-4419-bc33-a3231d0b59e6', frontend_stats_publish_address='ipc:///tmp/f21066c4-2444-497d-ae13-c1dd93d6f752'), parallel_config={}) +(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [v1/engine/core.py:826] Has DP Coordinator: True, stats publish address: ipc:///tmp/f21066c4-2444-497d-ae13-c1dd93d6f752 +(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore_DP0 pid=437) INFO 04-22 01:41:48 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore_DP0 pid=437) WARNING 04-22 01:41:48 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore_DP0 pid=437) INFO 04-22 01:41:48 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.129.9.34 (local), world_size=1, local_world_size=1 +(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/65e715b1-9a8f-41c5-9ec9-075a5689f469 +(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 16777216, 10, 'psm_3dd4e55a'), local_subscribe_addr='ipc:///tmp/65e715b1-9a8f-41c5-9ec9-075a5689f469', local_notify_addr='ipc:///tmp/ac8cd9fe-f882-4792-b609-73f8cc090691', remote_subscribe_addr=None, remote_addr_ipv6=False) +(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [v1/engine/core.py:1018] Waiting for init message from front-end. +DEBUG 04-22 01:41:48 [v1/engine/utils.py:1158] HELLO from local core engine process 1. 
+(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/79346ad6-6e10-42b0-95ff-725d8af58318', 'ipc:///tmp/76f7bc9d-9737-4b5a-849d-e58f7f34ac76'], outputs=['ipc:///tmp/739e9588-9318-470d-8b78-7c2a18de5ffa', 'ipc:///tmp/5b6feb5c-2bf1-4aa0-be99-632876126031'], coordinator_input='ipc:///tmp/45db5870-0521-4512-b1ea-dbb2fb7083e8', coordinator_output='ipc:///tmp/4b686f6c-d679-4419-bc33-a3231d0b59e6', frontend_stats_publish_address='ipc:///tmp/f21066c4-2444-497d-ae13-c1dd93d6f752'), parallel_config={}) +(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [v1/engine/core.py:826] Has DP Coordinator: True, stats publish address: ipc:///tmp/f21066c4-2444-497d-ae13-c1dd93d6f752 +(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(EngineCore_DP1 pid=438) WARNING 04-22 01:41:48 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore_DP1 pid=438) INFO 04-22 01:41:48 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.129.9.34 (local), world_size=1, local_world_size=1 +(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/f5efeb96-b1c1-4784-a3f8-e05ffaf8af6c +(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 16777216, 10, 'psm_b418ed59'), local_subscribe_addr='ipc:///tmp/f5efeb96-b1c1-4784-a3f8-e05ffaf8af6c', local_notify_addr='ipc:///tmp/742ab49c-d494-48b1-b8cd-3d76bc9e2181', remote_subscribe_addr=None, remote_addr_ipv6=False) +(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(ApiServer_1 pid=440) WARNING 04-22 01:41:48 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(ApiServer_0 pid=439) WARNING 04-22 01:41:48 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0005344 secs +(ApiServer_0 pid=439) INFO 04-22 01:41:48 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(ApiServer_0 pid=439) INFO 04-22 01:41:48 [config/model.py:1678] Using max model len 8192 +(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [config/model.py:1743] Generative models support chunked prefill. +(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [config/model.py:1801] Generative models support prefix caching. +(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [config/parallel.py:743] Defaulting to use mp for distributed inference +(ApiServer_0 pid=439) INFO 04-22 01:41:48 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(ApiServer_0 pid=439) INFO 04-22 01:41:48 [config/vllm.py:790] Asynchronous scheduling is enabled. +(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0004159 secs +(ApiServer_1 pid=440) INFO 04-22 01:41:48 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(ApiServer_1 pid=440) INFO 04-22 01:41:48 [config/model.py:1678] Using max model len 8192 +(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [config/model.py:1743] Generative models support chunked prefill. +(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [config/model.py:1801] Generative models support prefix caching. +(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [config/parallel.py:743] Defaulting to use mp for distributed inference +(ApiServer_1 pid=440) INFO 04-22 01:41:48 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(ApiServer_1 pid=440) INFO 04-22 01:41:48 [config/vllm.py:790] Asynchronous scheduling is enabled. +(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. 
+(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(ApiServer_1 pid=440) DEBUG 04-22 01:41:49 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(ApiServer_1 pid=440) DEBUG 04-22 01:41:49 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(ApiServer_0 pid=439) DEBUG 04-22 01:41:49 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(ApiServer_0 pid=439) DEBUG 04-22 01:41:49 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +(ApiServer_1 pid=440) DEBUG 04-22 01:41:49 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(ApiServer_1 pid=440) DEBUG 04-22 01:41:49 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:41:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:41:51 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:41:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:41:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:41:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:41:51 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:41:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:41:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:41:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:41:51 [platforms/__init__.py:113] Checking if ROCm platform is available. 
+DEBUG 04-22 01:41:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:41:51 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:41:51 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:41:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:41:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:41:51 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:41:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:41:51 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:41:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:41:51 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:41:51 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:41:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:41:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:41:51 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:41:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:41:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 01:41:57 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:41:57 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:41:57 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:41:57 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 01:41:57 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:41:57 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:41:57 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:41:57 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 01:41:58 [v1/engine/utils.py:1047] Waiting for 2 local, 0 remote core engine proc(s) to start. +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. 
+(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 605, in __init__ +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] self.worker.init_device() +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 312, in init_device +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] self.worker.init_device() # type: ignore +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] 
^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 242, in init_device +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] assert self.local_rank < torch.accelerator.device_count(), ( +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] AssertionError: DP adjusted local rank 1 is out of bounds. +(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:58 [v1/executor/multiproc_executor.py:419] Worker Termination: allow workers to gracefully shutdown +(Worker pid=1219) DEBUG 04-22 01:41:58 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:33047 backend=nccl +(Worker pid=1219) INFO 04-22 01:41:58 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:33047 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(Worker pid=1219) DEBUG 04-22 01:41:58 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(Worker pid=1219) INFO 04-22 01:41:58 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(Worker pid=1219) DEBUG 04-22 01:41:58 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822118.8622458, auto_measure=True +(Worker pid=1219) DEBUG 04-22 01:41:58 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=1219) DEBUG 04-22 01:41:58 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=1219) DEBUG 04-22 01:41:58 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=1219) DEBUG 04-22 01:41:58 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=1219) DEBUG 04-22 01:41:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=1219) DEBUG 04-22 01:41:59 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=1219) DEBUG 04-22 01:41:59 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=1219) DEBUG 04-22 01:41:59 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). 
+(Worker pid=1219) INFO 04-22 01:41:59 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] EngineCore failed to start. +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] Traceback (most recent call last): +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] return func(*args, **kwargs) +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__ +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] super().__init__( +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 114, in __init__ +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] self.model_executor = executor_class(vllm_config) +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 101, in __init__ +(EngineCore_DP1 pid=438) ERROR 
04-22 01:41:59 [v1/engine/core.py:1108] super().__init__(vllm_config) +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] return func(*args, **kwargs) +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 103, in __init__ +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] self._init_executor() +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 190, in _init_executor +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] self.workers = WorkerProc.wait_for_ready(unready_workers) +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 736, in wait_for_ready +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] raise e from None +(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause. 
+(EngineCore_DP1 pid=438) Process EngineCore_DP1: +(EngineCore_DP1 pid=438) Traceback (most recent call last): +(EngineCore_DP1 pid=438) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap +(EngineCore_DP1 pid=438) self.run() +(EngineCore_DP1 pid=438) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run +(EngineCore_DP1 pid=438) self._target(*self._args, **self._kwargs) +(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1112, in run_engine_core +(EngineCore_DP1 pid=438) raise e +(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core +(EngineCore_DP1 pid=438) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) +(EngineCore_DP1 pid=438) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(EngineCore_DP1 pid=438) return func(*args, **kwargs) +(EngineCore_DP1 pid=438) ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__ +(EngineCore_DP1 pid=438) super().__init__( +(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 114, in __init__ +(EngineCore_DP1 pid=438) self.model_executor = executor_class(vllm_config) +(EngineCore_DP1 pid=438) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 101, in __init__ +(EngineCore_DP1 pid=438) super().__init__(vllm_config) +(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(EngineCore_DP1 pid=438) return func(*args, **kwargs) +(EngineCore_DP1 pid=438) ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP1 pid=438) File 
"/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 103, in __init__ +(EngineCore_DP1 pid=438) self._init_executor() +(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 190, in _init_executor +(EngineCore_DP1 pid=438) self.workers = WorkerProc.wait_for_ready(unready_workers) +(EngineCore_DP1 pid=438) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 736, in wait_for_ready +(EngineCore_DP1 pid=438) raise e from None +(EngineCore_DP1 pid=438) Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause. +(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:59 [v1/executor/multiproc_executor.py:438] Triggering shutdown of workers +(Worker pid=1219) DEBUG 04-22 01:41:59 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker pid=1219) INFO 04-22 01:41:59 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(Worker pid=1219) INFO 04-22 01:41:59 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker pid=1219) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=1219) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(Worker pid=1219) DEBUG 04-22 01:41:59 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker pid=1219) DEBUG 04-22 01:41:59 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker pid=1219) DEBUG 04-22 01:41:59 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker pid=1219) DEBUG 04-22 01:41:59 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +Traceback (most recent call last): + File "/usr/local/bin/vllm", line 10, in + sys.exit(main()) + ^^^^^^ + File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main + args.dispatch_function(args) + File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 118, in cmd + run_multi_api_server(args) + File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 286, in run_multi_api_server + with launch_core_engines( + ^^^^^^^^^^^^^^^^^^^^ + File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ + next(self.gen) + File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines + wait_for_engine_startup( + File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup + raise RuntimeError( +RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} +(Worker pid=1219) DEBUG 04-22 01:42:00 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors']] +(Worker pid=1219) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:42:32 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:42:32 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 01:42:32 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:42:32 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:42:32 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:42:32 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct +(APIServer pid=1) INFO 04-22 01:42:32 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:42:32 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:42:32 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 8192, 'pipeline_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:42:32 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:42:33 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:42:33 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003599 secs +(APIServer pid=1) INFO 04-22 01:42:33 
[config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 01:42:33 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 01:42:33 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:42:33 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:42:33 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:42:33 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:42:33 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:42:33 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 01:42:33 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:42:33 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 01:42:33 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:42:33 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:42:33 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:42:33 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:42:37 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:42:37 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:42:37 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:42:37 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-22 01:42:37 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:42:37 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:42:37 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:42:37 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:42:37 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:42:37 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:42:37 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:42:37 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:42:42 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=243) DEBUG 04-22 01:42:43 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:42:43 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=243) DEBUG 04-22 01:42:43 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/6b7dad8b-6920-4893-bea2-725a72500b35'], outputs=['ipc:///tmp/dbf6128c-0c73-4293-9574-415241db12e2'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 01:42:43 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 01:42:43 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 01:42:43 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 01:42:43 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 01:42:43 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=243) INFO 04-22 01:42:43 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=2, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) WARNING 04-22 01:42:43 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=243) INFO 04-22 01:42:43 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.3.125 (local), world_size=2, local_world_size=2 +(EngineCore pid=243) DEBUG 04-22 01:42:43 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/4ada89ba-6925-4164-89e6-3295f56a409c +(EngineCore pid=243) DEBUG 04-22 01:42:43 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_c5cfa3c7'), local_subscribe_addr='ipc:///tmp/4ada89ba-6925-4164-89e6-3295f56a409c', local_notify_addr='ipc:///tmp/34fbc80b-d2fa-4999-8433-0eeed4ebbc91', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 01:42:47 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:42:47 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:42:47 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:42:47 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:42:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:42:47 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:42:47 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:42:47 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:42:47 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:42:47 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:42:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:42:47 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-22 01:42:47 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:42:47 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:42:47 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:42:47 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:42:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:42:47 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:42:47 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:42:47 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:42:47 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:42:47 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:42:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:42:47 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:42:51 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:42:52 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:42:53 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:42:53 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:42:53 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:42:53 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) DEBUG 04-22 01:42:53 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +DEBUG 04-22 01:42:53 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:42:53 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:42:53 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:42:53 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(Worker pid=442) DEBUG 04-22 01:42:54 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:55875 backend=nccl +(Worker pid=442) INFO 04-22 01:42:54 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:55875 backend=nccl +(Worker pid=443) DEBUG 04-22 01:42:54 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:55875 backend=nccl +(Worker pid=443) INFO 04-22 01:42:54 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:55875 backend=nccl +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=442) DEBUG 04-22 01:42:54 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=443) DEBUG 04-22 01:42:54 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=442) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=442) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=442) DEBUG 04-22 01:42:54 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=442) INFO 04-22 01:42:54 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(Worker pid=442) INFO 04-22 01:42:55 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(Worker pid=442) DEBUG 04-22 01:42:55 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.76GiB, total_memory=79.19GiB, cuda_memory=1.43GiB, torch_memory=0.0GiB, non_torch_memory=1.43GiB, timestamp=1776822175.437332, auto_measure=True +(Worker pid=442) DEBUG 04-22 01:42:55 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=443) DEBUG 04-22 01:42:55 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.76GiB, total_memory=79.19GiB, cuda_memory=1.43GiB, torch_memory=0.0GiB, non_torch_memory=1.43GiB, timestamp=1776822175.4582114, auto_measure=True +(Worker pid=443) DEBUG 04-22 01:42:55 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=442) DEBUG 04-22 01:42:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=442) DEBUG 04-22 01:42:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=442) DEBUG 04-22 01:42:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 01:42:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 01:42:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=443) DEBUG 04-22 01:42:55 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=442) DEBUG 04-22 01:42:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=442) DEBUG 04-22 01:42:55 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=442) DEBUG 04-22 01:42:55 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=443) DEBUG 04-22 01:42:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=443) DEBUG 04-22 01:42:55 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=442) DEBUG 04-22 01:42:55 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_PP0 pid=442) INFO 04-22 01:42:55 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... +(Worker_PP0 pid=442) DEBUG 04-22 01:42:55 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_PP0 pid=442) INFO 04-22 01:42:55 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(Worker_PP0 pid=442) INFO 04-22 01:42:55 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_PP1 pid=443) DEBUG 04-22 01:42:56 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_PP1 pid=443) DEBUG 04-22 01:42:56 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_PP1 pid=443) DEBUG 04-22 01:42:56 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 33, 'silu_and_mul': 16, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_PP1 pid=443) DEBUG 04-22 01:42:56 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_PP0 pid=442) DEBUG 04-22 01:42:56 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_PP0 pid=442) DEBUG 04-22 01:42:56 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_PP0 pid=442) DEBUG 04-22 01:42:56 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 32, 'silu_and_mul': 16, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1}) +(Worker_PP0 pid=442) DEBUG 04-22 01:42:56 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_PP1 pid=443) DEBUG 04-22 01:42:56 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors']] +(Worker_PP0 pid=442) DEBUG 04-22 01:42:56 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00002-of-00004.safetensors']] +(Worker_PP0 pid=442) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(Worker_PP1 pid=443) DEBUG 04-22 01:43:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:00 [compilation/decorators.py:528] Start compiling function +(EngineCore pid=243) DEBUG 04-22 01:43:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py 
+(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/sequence.py +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 
cfg=f2b27d324e comp=e546579c48 code=7f2f48347ccdde3394423985176b64bbf478231b0e1845a35438330041d319b6 dir=/data/.cache/vllm/torch_compile_cache/639e59bee9/rank_1_0/backbone +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VERBOSE': 
False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 
'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, 
+(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_PP1 
pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_PP1 pid=443) 
DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_PP1 pid=443) DEBUG 04-22 
01:43:02 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_PP1 pid=443) 
DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 
256, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, 
+(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 
'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] Vllm config hash: f2b27d324e +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py 
+(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_PP0 
pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/sequence.py +(Worker_PP0 pid=442) INFO 04-22 01:43:02 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/6749c2ea5a/rank_0_0/backbone for vLLM's torch.compile +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=fc2500db0a comp=e546579c48 code=7f24f63a7548e6fdf64e57e8c7d52119b84b1d43a0b72f3e4a2b71446193f817 dir=/data/.cache/vllm/torch_compile_cache/6749c2ea5a/rank_0_0/backbone +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, 
+(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 
[compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 
[compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 
[compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', 
+(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 
1, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 
[compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 
[compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, 
+(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] Vllm config hash: fc2500db0a +(Worker_PP0 pid=442) INFO 04-22 01:43:02 [compilation/backends.py:1111] Dynamo bytecode transform time: 2.48 s +(Worker_PP0 pid=442) DEBUG 04-22 01:43:03 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(Worker_PP0 pid=442) DEBUG 04-22 01:43:03 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(APIServer pid=1) DEBUG 04-22 01:43:03 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_PP1 pid=443) DEBUG 04-22 01:43:03 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP1 pid=443) DEBUG 04-22 01:43:03 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_PP1 pid=443) DEBUG 04-22 01:43:03 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_PP1 pid=443) DEBUG 04-22 01:43:03 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP1 pid=443) DEBUG 04-22 01:43:03 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_PP0 pid=442) DEBUG 04-22 01:43:03 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP0 pid=442) DEBUG 04-22 01:43:03 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_PP0 pid=442) DEBUG 04-22 01:43:03 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_PP0 pid=442) DEBUG 04-22 01:43:03 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP0 pid=442) DEBUG 04-22 01:43:03 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_PP0 pid=442) INFO 04-22 01:43:05 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(Worker_PP0 pid=442) DEBUG 04-22 01:43:05 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/6749c2ea5a/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(Worker_PP1 pid=443) DEBUG 04-22 01:43:06 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP1 pid=443) DEBUG 04-22 01:43:06 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass 
completed in 0.6 ms +(Worker_PP1 pid=443) DEBUG 04-22 01:43:06 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_PP1 pid=443) DEBUG 04-22 01:43:06 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP1 pid=443) DEBUG 04-22 01:43:06 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_PP0 pid=442) DEBUG 04-22 01:43:06 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP0 pid=442) DEBUG 04-22 01:43:06 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_PP0 pid=442) DEBUG 04-22 01:43:06 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_PP0 pid=442) DEBUG 04-22 01:43:06 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP0 pid=442) DEBUG 04-22 01:43:06 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_PP0 pid=442) DEBUG 04-22 01:43:07 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/6749c2ea5a/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(Worker_PP1 pid=443) DEBUG 04-22 01:43:07 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP1 pid=443) DEBUG 04-22 01:43:07 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_PP1 pid=443) DEBUG 04-22 01:43:07 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.6 ms +(Worker_PP1 pid=443) DEBUG 04-22 01:43:07 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP1 pid=443) DEBUG 04-22 01:43:07 
[compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_PP0 pid=442) DEBUG 04-22 01:43:07 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP0 pid=442) DEBUG 04-22 01:43:07 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_PP0 pid=442) DEBUG 04-22 01:43:07 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_PP0 pid=442) DEBUG 04-22 01:43:07 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP0 pid=442) DEBUG 04-22 01:43:07 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_PP0 pid=442) DEBUG 04-22 01:43:08 [compilation/backends.py:377] Store the 16-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_16', '/data/.cache/vllm/torch_compile_cache/6749c2ea5a/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_16') +(Worker_PP0 pid=442) INFO 04-22 01:43:08 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.20 s +(Worker_PP0 pid=442) DEBUG 04-22 01:43:08 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/6749c2ea5a/rank_0_0/backbone/computation_graph.py +(Worker_PP0 pid=442) INFO 04-22 01:43:08 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/378a2f90d7ba67f1593b3e36e293b0841d472907a0cee2c9633e63eaed3ee443/rank_0_0/model +(Worker_PP0 pid=442) INFO 04-22 01:43:08 [compilation/monitor.py:48] torch.compile took 8.22 s in total +(Worker_PP0 pid=442) INFO 04-22 01:43:09 [compilation/monitor.py:76] Initial profiling/warmup run took 0.37 s +(Worker_PP0 pid=442) INFO 04-22 01:43:09 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_PP0 pid=442) 
DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_PP0 pid=442) INFO 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 114.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=442) DEBUG 04-22 
01:43:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 256.00 MiB first-capture + (51-1) × 2.00 MiB per-graph +(Worker_PP0 pid=442) DEBUG 04-22 01:43:10 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_PP0 pid=442) INFO 04-22 01:43:10 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.54 GiB total +(Worker_PP0 pid=442) DEBUG 04-22 01:43:10 [v1/worker/gpu_worker.py:424] Initial free memory: 77.76 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_PP0 pid=442) DEBUG 04-22 01:43:10 [v1/worker/gpu_worker.py:430] Free memory after profiling: 70.02 GiB (total), 67.49 GiB (within requested) +(Worker_PP0 pid=442) DEBUG 04-22 01:43:10 [v1/worker/gpu_worker.py:435] Memory profiling takes 9.87 seconds. Total non KV cache memory: 8.69GiB; torch peak memory increase: 1.1GiB; non-torch forward increase memory: 0.07GiB; weights memory: 7.51GiB. +(Worker_PP0 pid=442) INFO 04-22 01:43:10 [v1/worker/gpu_worker.py:436] Available KV cache memory: 66.54 GiB +(Worker_PP0 pid=442) INFO 04-22 01:43:10 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9569 to maintain the same effective KV cache size. 
+(Worker_PP0 pid=442) DEBUG 04-22 01:43:10 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 01:43:10 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 01:43:10 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) DEBUG 04-22 01:43:13 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_PP1 pid=443) INFO 04-22 01:43:14 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_PP1 pid=443) INFO 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 134.00 MiB 
first-capture + (51-1) × 4.00 MiB per-graph +(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 256.00 MiB first-capture + (51-1) × 2.00 MiB per-graph +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_PP1 pid=443) INFO 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.54 GiB total +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_worker.py:424] Initial free memory: 77.76 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_worker.py:430] Free memory after profiling: 69.8 GiB (total), 67.26 GiB (within requested) +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_worker.py:435] Memory profiling takes 14.85 seconds. Total non KV cache memory: 9.6GiB; torch peak memory increase: 2.01GiB; non-torch forward increase memory: 0.07GiB; weights memory: 7.51GiB. 
+(Worker_PP1 pid=443) INFO 04-22 01:43:15 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9569 to maintain the same effective KV cache size. +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 01:43:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) INFO 04-22 01:43:15 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,075,312 tokens +(EngineCore pid=243) INFO 04-22 01:43:15 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 131.26x +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_PP0 pid=442) DEBUG 04-22 01:43:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 01:43:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_PP1 pid=443) 2026-04-22 01:43:15,416 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_PP0 pid=442) 2026-04-22 01:43:15,416 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=442) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) 2026-04-22 01:43:15,424 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_PP0 pid=442) 2026-04-22 01:43:15,424 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=480, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=464, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=448, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=442) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:43:37 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:43:37 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:43:37 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:43:37 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:43:37 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:43:37 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct +(APIServer pid=1) INFO 04-22 01:43:37 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:43:37 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:43:37 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 8192, 'pipeline_parallel_size': 4, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:43:37 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:43:37 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:43:37 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0009235 secs +(APIServer pid=1) INFO 04-22 01:43:37 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 01:43:37 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 01:43:37 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:43:37 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:43:37 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:43:37 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 01:43:37 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:43:37 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 01:43:37 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:43:37 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 01:43:37 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:43:37 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:43:38 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:43:38 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:43:41 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:43:41 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:43:41 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:43:41 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:43:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:43:41 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:43:41 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:43:41 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:43:41 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:43:41 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:43:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
+DEBUG 04-22 01:43:41 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:43:46 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=242) DEBUG 04-22 01:43:48 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:43:48 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=242) DEBUG 04-22 01:43:48 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/41703b62-cf91-44bb-8658-ca90b0b1dddd'], outputs=['ipc:///tmp/ded797e4-47d0-44f5-b379-ddd276a01a31'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=242) DEBUG 04-22 01:43:48 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=242) DEBUG 04-22 01:43:48 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=242) DEBUG 04-22 01:43:48 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=242) DEBUG 04-22 01:43:48 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=242) DEBUG 04-22 01:43:48 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=242) INFO 04-22 01:43:48 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=4, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=242) WARNING 04-22 01:43:48 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=242) INFO 04-22 01:43:48 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.130.2.185 (local), world_size=4, local_world_size=4 +(EngineCore pid=242) DEBUG 04-22 01:43:48 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/097fc3f9-a611-40fe-aaca-2a5bf508b979 +(EngineCore pid=242) DEBUG 04-22 01:43:48 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_ed275266'), local_subscribe_addr='ipc:///tmp/097fc3f9-a611-40fe-aaca-2a5bf508b979', local_notify_addr='ipc:///tmp/ec1e175c-82f2-4d1b-a146-12d710e40bc0', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 01:43:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:43:51 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:43:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:43:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:43:51 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:43:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:43:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:43:51 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:43:51 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-22 01:43:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:43:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:43:51 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:43:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:43:51 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:43:51 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:43:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:113] Checking if ROCm platform is available. 
+DEBUG 04-22 01:43:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:43:51 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:43:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:43:51 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:43:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:43:51 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:43:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:43:57 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:43:57 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 01:43:57 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(APIServer pid=1) DEBUG 04-22 01:43:58 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +DEBUG 04-22 01:43:58 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:43:58 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:43:58 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:43:58 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 01:43:58 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:43:58 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:43:58 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:43:58 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 01:43:58 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:43:58 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:43:58 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:43:58 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+DEBUG 04-22 01:43:58 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:43:58 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:43:58 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:43:58 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(Worker pid=441) DEBUG 04-22 01:44:00 [distributed/parallel_state.py:1356] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:44231 backend=nccl +(Worker pid=441) INFO 04-22 01:44:00 [distributed/parallel_state.py:1400] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:44231 backend=nccl +(Worker pid=444) DEBUG 04-22 01:44:00 [distributed/parallel_state.py:1356] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:44231 backend=nccl +(Worker pid=444) INFO 04-22 01:44:00 [distributed/parallel_state.py:1400] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:44231 backend=nccl +(Worker pid=443) DEBUG 04-22 01:44:00 [distributed/parallel_state.py:1356] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:44231 backend=nccl +(Worker pid=443) INFO 04-22 01:44:00 [distributed/parallel_state.py:1400] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:44231 backend=nccl +(Worker pid=442) DEBUG 04-22 01:44:00 [distributed/parallel_state.py:1356] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:44231 backend=nccl +(Worker pid=442) INFO 04-22 01:44:00 [distributed/parallel_state.py:1400] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:44231 backend=nccl +[Gloo] Rank 0 is connected to 3 peer ranks. 
Expected number of connected peer ranks is : 3 +[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=442) DEBUG 04-22 01:44:00 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=443) DEBUG 04-22 01:44:00 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=441) DEBUG 04-22 01:44:00 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=444) DEBUG 04-22 01:44:00 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 1 is connected to 3 peer ranks. 
Expected number of connected peer ranks is : 3 +[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=441) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=442) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=441) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=442) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(Worker pid=441) DEBUG 04-22 01:44:01 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=441) INFO 04-22 01:44:01 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(Worker pid=441) INFO 04-22 01:44:02 [distributed/parallel_state.py:1716] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(Worker pid=443) DEBUG 04-22 01:44:02 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, timestamp=1776822242.6008415, auto_measure=True +(Worker pid=443) DEBUG 04-22 01:44:02 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=441) DEBUG 04-22 01:44:02 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, timestamp=1776822242.6116424, auto_measure=True +(Worker pid=444) DEBUG 04-22 01:44:02 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, timestamp=1776822242.6117628, auto_measure=True +(Worker pid=441) DEBUG 04-22 01:44:02 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=444) DEBUG 04-22 01:44:02 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=443) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic 
dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=444) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=441) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=441) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=443) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=441) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=441) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=442) DEBUG 04-22 
01:44:02 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, timestamp=1776822242.755553, auto_measure=True +(Worker pid=442) DEBUG 04-22 01:44:02 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=443) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=441) DEBUG 04-22 01:44:02 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=444) DEBUG 04-22 01:44:02 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=441) DEBUG 04-22 01:44:02 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=443) DEBUG 04-22 01:44:02 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=441) DEBUG 04-22 01:44:02 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_PP0 pid=441) INFO 04-22 01:44:02 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... 
+(Worker pid=442) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=442) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=442) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=442) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=442) DEBUG 04-22 01:44:02 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker_PP0 pid=441) DEBUG 04-22 01:44:03 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_PP0 pid=441) INFO 04-22 01:44:03 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(Worker_PP0 pid=441) INFO 04-22 01:44:03 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_PP3 pid=444) DEBUG 04-22 01:44:03 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_PP2 pid=443) DEBUG 04-22 01:44:03 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_PP3 pid=444) DEBUG 04-22 01:44:03 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_PP3 pid=444) DEBUG 04-22 01:44:03 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 17, 'silu_and_mul': 8, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_PP3 pid=444) DEBUG 04-22 01:44:03 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_PP2 pid=443) DEBUG 04-22 01:44:03 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_PP2 pid=443) DEBUG 04-22 01:44:03 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 16, 'silu_and_mul': 8, 'rotary_embedding': 1, 'apply_rotary_emb': 1}) +(Worker_PP2 pid=443) DEBUG 04-22 01:44:03 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_PP0 pid=441) DEBUG 04-22 01:44:03 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_PP0 pid=441) DEBUG 04-22 01:44:03 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_PP0 pid=441) DEBUG 04-22 01:44:03 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 16, 'silu_and_mul': 8, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1}) +(Worker_PP0 pid=441) DEBUG 04-22 01:44:03 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_PP1 pid=442) DEBUG 04-22 01:44:03 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_PP1 pid=442) DEBUG 04-22 01:44:03 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_PP1 pid=442) DEBUG 04-22 01:44:03 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 16, 'silu_and_mul': 8, 'rotary_embedding': 1, 'apply_rotary_emb': 1}) +(Worker_PP1 pid=442) DEBUG 04-22 01:44:03 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_PP2 pid=443) DEBUG 04-22 01:44:03 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00001-of-00004.safetensors']] +(Worker_PP0 pid=441) DEBUG 04-22 01:44:03 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors']] +(Worker_PP3 pid=444) DEBUG 04-22 01:44:03 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors']] +(Worker_PP1 pid=442) DEBUG 04-22 01:44:03 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors']] +(Worker_PP0 pid=441) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(Worker_PP1 pid=442) DEBUG 04-22 01:44:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP2 pid=443) DEBUG 04-22 01:44:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP3 pid=444) DEBUG 04-22 01:44:06 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=442) DEBUG 04-22 01:44:06 [compilation/decorators.py:528] Start compiling function +(Worker_PP2 pid=443) DEBUG 04-22 01:44:06 [compilation/decorators.py:528] Start compiling function +(Worker_PP3 pid=444) DEBUG 04-22 01:44:06 [compilation/decorators.py:528] Start compiling function +(EngineCore pid=242) DEBUG 04-22 01:44:07 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) DEBUG 04-22 01:44:08 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_PP2 pid=443) DEBUG 
04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/sequence.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/sequence.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_PP0 pid=441) 
DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/sequence.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/sequence.py +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=9f910dc055 comp=e546579c48 code=7f2f48347ccdde3394423985176b64bbf478231b0e1845a35438330041d319b6 dir=/data/.cache/vllm/torch_compile_cache/c5dad8de95/rank_2_0/backbone +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=cfaafe1362 comp=e546579c48 code=7f2f48347ccdde3394423985176b64bbf478231b0e1845a35438330041d319b6 dir=/data/.cache/vllm/torch_compile_cache/f350f8e302/rank_3_0/backbone +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_PP2 pid=443) DEBUG 04-22 
01:44:08 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': 
None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, 
+(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_PP2 pid=443) 
DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': 
True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_PP2 pid=443) DEBUG 
04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, 
+(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] Vllm config hash: 9f910dc055 +(Worker_PP0 pid=441) INFO 04-22 01:44:08 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/55a7723969/rank_0_0/backbone for vLLM's torch.compile +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=980a30b2da comp=e546579c48 code=7f24f63a7548e6fdf64e57e8c7d52119b84b1d43a0b72f3e4a2b71446193f817 dir=/data/.cache/vllm/torch_compile_cache/55a7723969/rank_0_0/backbone +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_PP3 pid=444) DEBUG 04-22 
01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, 
+(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, 
+(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_PP3 pid=444) DEBUG 04-22 
01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, 
+(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] Vllm config hash: cfaafe1362 +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=9f910dc055 comp=e546579c48 code=7f2f48347ccdde3394423985176b64bbf478231b0e1845a35438330041d319b6 dir=/data/.cache/vllm/torch_compile_cache/c5dad8de95/rank_1_0/backbone +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_PP0 pid=441) DEBUG 
04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_PP0 pid=441) DEBUG 04-22 
01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_PP0 pid=441) DEBUG 
04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_PP0 pid=441) DEBUG 
04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 
0, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_PP0 pid=441) DEBUG 
04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] Vllm config hash: 980a30b2da +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_PP1 pid=442) DEBUG 04-22 
01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), 
+(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': 
True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_PP1 pid=442) DEBUG 
04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_PP1 pid=442) DEBUG 04-22 
01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] Vllm config hash: 9f910dc055 +(Worker_PP0 pid=441) INFO 04-22 01:44:08 [compilation/backends.py:1111] Dynamo bytecode transform time: 1.79 s +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(Worker_PP2 pid=443) DEBUG 04-22 01:44:09 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP2 pid=443) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_PP2 pid=443) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_PP2 pid=443) DEBUG 04-22 01:44:09 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP2 pid=443) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_PP3 pid=444) DEBUG 04-22 01:44:09 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP3 pid=444) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_PP3 pid=444) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_PP3 pid=444) DEBUG 04-22 01:44:09 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP3 pid=444) DEBUG 04-22 01:44:09 
[compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_PP1 pid=442) DEBUG 04-22 01:44:09 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP1 pid=442) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_PP1 pid=442) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_PP1 pid=442) DEBUG 04-22 01:44:09 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP1 pid=442) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_PP0 pid=441) DEBUG 04-22 01:44:09 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP0 pid=441) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_PP0 pid=441) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_PP0 pid=441) DEBUG 04-22 01:44:09 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP0 pid=441) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_PP0 pid=441) INFO 04-22 01:44:11 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(Worker_PP0 pid=441) DEBUG 04-22 01:44:11 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/55a7723969/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(Worker_PP0 pid=441) DEBUG 04-22 01:44:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP0 pid=441) 
DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_PP0 pid=441) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_PP0 pid=441) DEBUG 04-22 01:44:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP0 pid=441) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_PP2 pid=443) DEBUG 04-22 01:44:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP2 pid=443) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_PP2 pid=443) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_PP2 pid=443) DEBUG 04-22 01:44:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP2 pid=443) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_PP3 pid=444) DEBUG 04-22 01:44:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP3 pid=444) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_PP3 pid=444) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(Worker_PP3 pid=444) DEBUG 04-22 01:44:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP3 pid=444) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_PP1 pid=442) DEBUG 04-22 01:44:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP1 pid=442) DEBUG 04-22 01:44:11 
[compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(Worker_PP1 pid=442) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(Worker_PP1 pid=442) DEBUG 04-22 01:44:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP1 pid=442) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_PP0 pid=441) DEBUG 04-22 01:44:13 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/55a7723969/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(Worker_PP0 pid=441) DEBUG 04-22 01:44:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP0 pid=441) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_PP0 pid=441) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_PP0 pid=441) DEBUG 04-22 01:44:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP0 pid=441) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.6 ms +(Worker_PP2 pid=443) DEBUG 04-22 01:44:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP2 pid=443) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_PP2 pid=443) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_PP2 pid=443) DEBUG 04-22 01:44:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP2 
pid=443) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_PP3 pid=444) DEBUG 04-22 01:44:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP3 pid=444) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_PP3 pid=444) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_PP3 pid=444) DEBUG 04-22 01:44:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP3 pid=444) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_PP1 pid=442) DEBUG 04-22 01:44:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_PP1 pid=442) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_PP1 pid=442) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_PP1 pid=442) DEBUG 04-22 01:44:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_PP1 pid=442) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_PP0 pid=441) DEBUG 04-22 01:44:13 [compilation/backends.py:377] Store the 8-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_8', '/data/.cache/vllm/torch_compile_cache/55a7723969/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_8') +(Worker_PP0 pid=441) INFO 04-22 01:44:13 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.28 s +(Worker_PP0 pid=441) DEBUG 04-22 01:44:13 [compilation/backends.py:1228] Computation graph saved to 
/data/.cache/vllm/torch_compile_cache/55a7723969/rank_0_0/backbone/computation_graph.py +(Worker_PP0 pid=441) INFO 04-22 01:44:14 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/0ec2ebc3c5e681c25779ace37f1592abad189ac56456d07660699b9de0e0511a/rank_0_0/model +(Worker_PP0 pid=441) INFO 04-22 01:44:14 [compilation/monitor.py:48] torch.compile took 7.37 s in total +(Worker_PP0 pid=441) INFO 04-22 01:44:14 [compilation/monitor.py:76] Initial profiling/warmup run took 0.33 s +(Worker_PP2 pid=443) INFO 04-22 01:44:14 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_PP2 pid=443) INFO 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_PP0 pid=441) INFO 04-22 01:44:14 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_PP0 pid=441) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_PP0 pid=441) INFO 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None 
+(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 1.00 MiB per-graph +(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 256.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(Worker_PP1 pid=442) INFO 04-22 01:44:15 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_PP1 pid=442) INFO 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 
(largest=512), FULL=51 (largest=512) +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 1.00 MiB per-graph +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, 
num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 256.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 1.00 MiB per-graph +(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 256.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(Worker_PP2 pid=443) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_PP2 pid=443) INFO 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.49 GiB total +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_PP0 pid=441) INFO 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.49 GiB total +(Worker_PP2 pid=443) DEBUG 04-22 01:44:15 [v1/worker/gpu_worker.py:424] Initial free memory: 77.71 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_PP2 pid=443) DEBUG 04-22 01:44:15 [v1/worker/gpu_worker.py:430] Free memory after profiling: 74.07 GiB (total), 71.59 GiB (within requested) +(Worker_PP2 pid=443) DEBUG 04-22 01:44:15 [v1/worker/gpu_worker.py:435] Memory profiling takes 9.14 seconds. Total non KV cache memory: 4.4GiB; torch peak memory increase: 1.05GiB; non-torch forward increase memory: 0.07GiB; weights memory: 3.28GiB. +(Worker_PP2 pid=443) INFO 04-22 01:44:15 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9562 to maintain the same effective KV cache size. 
+(Worker_PP2 pid=443) DEBUG 04-22 01:44:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_PP1 pid=442) INFO 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.49 GiB total +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_worker.py:424] Initial free memory: 77.71 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_worker.py:430] Free memory after profiling: 73.22 GiB (total), 70.74 GiB (within requested) +(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_worker.py:435] Memory profiling takes 9.27 seconds. Total non KV cache memory: 5.43GiB; torch peak memory increase: 1.1GiB; non-torch forward increase memory: 0.07GiB; weights memory: 4.26GiB. +(Worker_PP0 pid=441) INFO 04-22 01:44:15 [v1/worker/gpu_worker.py:436] Available KV cache memory: 69.8 GiB +(Worker_PP0 pid=441) INFO 04-22 01:44:15 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9562 to maintain the same effective KV cache size. 
+(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=242) DEBUG 04-22 01:44:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=242) DEBUG 04-22 01:44:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_PP1 pid=442) DEBUG 04-22 01:44:16 [v1/worker/gpu_worker.py:424] Initial free memory: 77.71 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_PP1 pid=442) DEBUG 04-22 01:44:16 [v1/worker/gpu_worker.py:430] Free memory after profiling: 74.07 GiB (total), 71.59 GiB (within requested) +(Worker_PP1 pid=442) DEBUG 04-22 01:44:16 [v1/worker/gpu_worker.py:435] Memory profiling takes 9.52 seconds. Total non KV cache memory: 4.4GiB; torch peak memory increase: 1.05GiB; non-torch forward increase memory: 0.07GiB; weights memory: 3.28GiB. +(Worker_PP1 pid=442) INFO 04-22 01:44:16 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9562 to maintain the same effective KV cache size. +(Worker_PP1 pid=442) DEBUG 04-22 01:44:16 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=242) DEBUG 04-22 01:44:16 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=242) DEBUG 04-22 01:44:16 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) DEBUG 04-22 01:44:18 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_PP3 pid=444) INFO 04-22 01:44:20 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_PP3 pid=444) INFO 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 132.00 MiB first-capture + (51-1) × 1.00 MiB per-graph +(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
+(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 256.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_PP3 pid=444) INFO 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.49 GiB total +(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_worker.py:424] Initial free memory: 77.71 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_worker.py:430] Free memory after profiling: 73.0 GiB (total), 70.52 GiB (within requested) +(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_worker.py:435] Memory profiling takes 14.46 seconds. Total non KV cache memory: 6.34GiB; torch peak memory increase: 2.01GiB; non-torch forward increase memory: 0.07GiB; weights memory: 4.26GiB. +(Worker_PP3 pid=444) INFO 04-22 01:44:21 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9562 to maintain the same effective KV cache size. 
+(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=242) DEBUG 04-22 01:44:21 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=242) INFO 04-22 01:44:21 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 2,257,264 tokens +(EngineCore pid=242) INFO 04-22 01:44:21 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 275.54x +(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_PP0 pid=441) DEBUG 04-22 01:44:21 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_PP1 pid=442) DEBUG 04-22 01:44:21 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_PP2 pid=443) DEBUG 04-22 01:44:21 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=242) DEBUG 04-22 01:44:21 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=242) DEBUG 04-22 01:44:21 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_PP3 pid=444) 2026-04-22 01:44:21,255 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_PP2 pid=443) 2026-04-22 01:44:21,255 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_PP0 pid=441) 2026-04-22 01:44:21,255 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_PP1 pid=442) 2026-04-22 01:44:21,257 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP2 pid=443) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=441) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP1 pid=442) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP3 pid=444) 2026-04-22 01:44:21,264 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_PP2 pid=443) 2026-04-22 01:44:21,264 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_PP0 pid=441) 2026-04-22 01:44:21,264 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_PP1 pid=442) 2026-04-22 01:44:21,264 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP2 pid=443) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=480, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP2 pid=443) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP2 pid=443) DEBUG 04-22 01:44:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP2 pid=443) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_PP0 pid=441) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:35:30 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:35:30 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:35:30 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:35:30 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:35:30 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:35:30 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct +(APIServer pid=1) INFO 04-22 01:35:30 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:35:30 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:35:30 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:35:30 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:35:31 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:35:31 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0033037 secs +(APIServer pid=1) INFO 04-22 01:35:31 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 01:35:31 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 01:35:31 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:35:31 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:35:31 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:35:31 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 01:35:31 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:35:31 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 01:35:31 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:35:31 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) DEBUG 04-22 01:35:32 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. +(APIServer pid=1) INFO 04-22 01:35:32 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 01:35:32 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:35:32 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:35:33 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:35:33 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:35:36 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:35:36 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:35:36 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:35:36 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-22 01:35:37 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:35:37 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:35:37 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:35:37 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:35:37 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:35:37 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:35:37 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:35:37 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:35:41 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(APIServer pid=1) DEBUG 04-22 01:35:43 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. +(EngineCore pid=245) DEBUG 04-22 01:35:43 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:35:43 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=245) DEBUG 04-22 01:35:43 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/0d7210ec-58b5-4ffb-9e6f-cde13e376fc7'], outputs=['ipc:///tmp/424bfdcd-ff34-43c0-b07a-8ca1f43e5154'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=245) DEBUG 04-22 01:35:43 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=245) DEBUG 04-22 01:35:43 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=245) DEBUG 04-22 01:35:43 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=245) DEBUG 04-22 01:35:43 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=245) DEBUG 04-22 01:35:43 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=245) INFO 04-22 01:35:43 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=245) WARNING 04-22 01:35:43 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=245) INFO 04-22 01:35:43 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.129.9.32 (local), world_size=2, local_world_size=2 +(EngineCore pid=245) DEBUG 04-22 01:35:43 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/4aa6a999-a067-4b8d-ae8d-8df7a86cf373 +(EngineCore pid=245) DEBUG 04-22 01:35:43 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_d30f3657'), local_subscribe_addr='ipc:///tmp/4aa6a999-a067-4b8d-ae8d-8df7a86cf373', local_notify_addr='ipc:///tmp/cdf14d75-6649-4e99-9342-5c86670188e6', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 01:35:46 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:35:46 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:35:46 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:35:46 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:35:46 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:35:46 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:35:46 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:35:46 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:35:46 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:35:46 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:35:46 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:35:46 [platforms/__init__.py:134] Checking if XPU platform is available. 
+DEBUG 04-22 01:35:46 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:35:46 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:35:46 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:35:46 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:35:46 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:35:46 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:35:46 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:35:46 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:35:46 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:35:46 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:35:46 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:35:46 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:35:51 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:35:51 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:35:53 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:35:53 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:35:53 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:35:53 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+DEBUG 04-22 01:35:53 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:35:53 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:35:53 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:35:53 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) DEBUG 04-22 01:35:53 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker pid=444) DEBUG 04-22 01:35:53 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:44935 backend=nccl +(Worker pid=444) INFO 04-22 01:35:53 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:44935 backend=nccl +(Worker pid=445) DEBUG 04-22 01:35:53 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:44935 backend=nccl +(Worker pid=445) INFO 04-22 01:35:53 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:44935 backend=nccl +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=445) DEBUG 04-22 01:35:53 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=444) DEBUG 04-22 01:35:53 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 +(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=444) DEBUG 04-22 01:35:54 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=444) INFO 04-22 01:35:54 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=445) DEBUG 04-22 01:35:55 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=444) DEBUG 04-22 01:35:55 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=444) DEBUG 04-22 01:35:55 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/adab702a-4289-4a96-8bc9-d0d6e0d605b8 +(Worker pid=444) DEBUG 04-22 01:35:55 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_29a8fa29'), local_subscribe_addr='ipc:///tmp/adab702a-4289-4a96-8bc9-d0d6e0d605b8', local_notify_addr='ipc:///tmp/1eb60031-4fb6-4f99-803a-b938389780d0', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=445) DEBUG 04-22 01:35:55 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/adab702a-4289-4a96-8bc9-d0d6e0d605b8 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(Worker pid=444) INFO 04-22 01:35:55 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(Worker pid=445) DEBUG 04-22 01:35:55 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776821755.3217318, auto_measure=True +(Worker pid=445) DEBUG 04-22 01:35:55 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=444) DEBUG 04-22 01:35:55 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776821755.353211, auto_measure=True +(Worker pid=444) DEBUG 04-22 01:35:55 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=445) DEBUG 04-22 01:35:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=445) DEBUG 04-22 01:35:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=445) DEBUG 04-22 01:35:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:35:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:35:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=445) DEBUG 04-22 01:35:55 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=444) DEBUG 04-22 01:35:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=445) DEBUG 04-22 01:35:55 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=444) DEBUG 04-22 01:35:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=444) DEBUG 04-22 01:35:55 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=444) DEBUG 04-22 01:35:55 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=444) DEBUG 04-22 01:35:55 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_TP0 pid=444) INFO 04-22 01:35:55 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... +(Worker_TP0 pid=444) DEBUG 04-22 01:35:55 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_TP0 pid=444) INFO 04-22 01:35:55 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(Worker_TP0 pid=444) INFO 04-22 01:35:55 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP1 pid=445) DEBUG 04-22 01:35:56 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP1 pid=445) DEBUG 04-22 01:35:56 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP1 pid=445) DEBUG 04-22 01:35:56 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=445) DEBUG 04-22 01:35:56 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP0 pid=444) DEBUG 04-22 01:35:56 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=444) DEBUG 04-22 01:35:56 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=444) DEBUG 04-22 01:35:56 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=444) DEBUG 04-22 01:35:56 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP0 pid=444) DEBUG 04-22 01:35:56 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00002-of-00004.safetensors']] +(Worker_TP0 pid=444) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(Worker_TP0 pid=444) DEBUG 04-22 01:36:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:36:07 [compilation/decorators.py:528] Start compiling function +(EngineCore pid=245) DEBUG 04-22 01:36:08 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=f16cc641b7 comp=e546579c48 
code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/c9dc6a35ed/rank_1_0/backbone +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP1 pid=445) 
DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': 
False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 
[compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 
[compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 
[compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 
[compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP1 pid=445) DEBUG 
04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, 
+(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP1 
pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 
'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] Vllm config hash: f16cc641b7 +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py 
+(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP0 
pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP0 pid=444) INFO 04-22 01:36:12 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/c9dc6a35ed/rank_0_0/backbone for vLLM's torch.compile +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=f16cc641b7 comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/c9dc6a35ed/rank_0_0/backbone +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, 
+(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 
'VLLM_BATCH_INVARIANT': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 
[compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 
[compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, 
+(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': 
-1, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP0 pid=444) DEBUG 04-22 
01:36:12 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 
[compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, 
+(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] Vllm config hash: f16cc641b7 +(Worker_TP0 pid=444) INFO 04-22 01:36:12 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.63 s +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 8192 +(Worker_TP0 pid=444) INFO 04-22 01:36:12 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm +(Worker_TP0 pid=444) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. 
+(Worker_TP0 pid=444) return func(*args, **kwargs) +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=8192, hidden_dim=4096, dtype=torch.bfloat16 +(Worker_TP1 pid=445) DEBUG 04-22 01:36:12 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=8192, hidden_dim=4096, dtype=torch.bfloat16 +(Worker_TP0 pid=444) INFO 04-22 01:36:12 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(Worker_TP1 pid=445) DEBUG 04-22 01:36:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 01:36:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:36:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 01:36:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:36:13 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP1 pid=445) DEBUG 04-22 01:36:13 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 25.5 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:36:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:36:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes 
+(Worker_TP1 pid=445) DEBUG 04-22 01:36:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:36:13 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP0 pid=444) DEBUG 04-22 01:36:13 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 27.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:36:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:36:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP0 pid=444) DEBUG 04-22 01:36:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(APIServer pid=1) DEBUG 04-22 01:36:13 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=444) INFO 04-22 01:36:15 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(Worker_TP0 pid=444) DEBUG 04-22 01:36:15 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/c9dc6a35ed/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(Worker_TP0 pid=444) DEBUG 04-22 01:36:16 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 01:36:16 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:36:16 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 01:36:16 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:36:16 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 
pid=444) DEBUG 04-22 01:36:16 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.8 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:36:16 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:36:16 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=444) DEBUG 04-22 01:36:16 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:36:16 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=445) DEBUG 04-22 01:36:16 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.8 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:36:16 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:36:16 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=445) DEBUG 04-22 01:36:16 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:36:16 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/c9dc6a35ed/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(Worker_TP1 pid=445) DEBUG 04-22 01:36:18 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 01:36:18 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms 
+(Worker_TP1 pid=445) DEBUG 04-22 01:36:18 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=445) DEBUG 04-22 01:36:18 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.8 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:36:18 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:36:18 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP1 pid=445) DEBUG 04-22 01:36:18 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 42.6 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/c9dc6a35ed/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') +(Worker_TP0 pid=444) INFO 04-22 01:36:18 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.85 s +(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/c9dc6a35ed/rank_0_0/backbone/computation_graph.py +(Worker_TP0 pid=444) INFO 04-22 01:36:19 
[compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/837da8fa190b7b367e1a3f76b3995bb570fece3b66a8838c9778e41806ef4a88/rank_0_0/model +(Worker_TP0 pid=444) INFO 04-22 01:36:19 [compilation/monitor.py:48] torch.compile took 11.94 s in total +(Worker_TP0 pid=444) INFO 04-22 01:36:19 [compilation/monitor.py:76] Initial profiling/warmup run took 0.17 s +(APIServer pid=1) DEBUG 04-22 01:36:23 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP1 pid=445) INFO 04-22 01:36:25 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP1 pid=445) INFO 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP0 pid=444) INFO 04-22 01:36:25 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP0 pid=444) INFO 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) 
+(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 96.00 MiB first-capture + (51-1) × 26.00 MiB per-graph +(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 96.00 MiB first-capture + (51-1) × 26.00 MiB per-graph +(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: 
None +(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 132.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP1 pid=445) INFO 04-22 01:36:25 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 132.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP0 pid=444) INFO 04-22 01:36:25 
[distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP1 pid=445) DEBUG 04-22 01:36:26 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP1 pid=445) INFO 04-22 01:36:26 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.69 GiB total +(Worker_TP0 pid=444) DEBUG 04-22 01:36:26 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP0 pid=444) INFO 04-22 01:36:26 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.69 GiB total +(Worker_TP1 pid=445) DEBUG 04-22 01:36:26 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP1 pid=445) DEBUG 04-22 01:36:26 [v1/worker/gpu_worker.py:430] Free memory after profiling: 67.66 GiB (total), 65.23 GiB (within requested) +(Worker_TP1 pid=445) DEBUG 04-22 01:36:26 [v1/worker/gpu_worker.py:435] Memory profiling takes 19.45 seconds. Total non KV cache memory: 11.47GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 2.07GiB; weights memory: 7.51GiB. +(Worker_TP1 pid=445) INFO 04-22 01:36:26 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9714 to maintain the same effective KV cache size. 
+(Worker_TP1 pid=445) DEBUG 04-22 01:36:26 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=444) DEBUG 04-22 01:36:26 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP0 pid=444) DEBUG 04-22 01:36:26 [v1/worker/gpu_worker.py:430] Free memory after profiling: 67.66 GiB (total), 65.23 GiB (within requested) +(Worker_TP0 pid=444) DEBUG 04-22 01:36:26 [v1/worker/gpu_worker.py:435] Memory profiling takes 19.38 seconds. Total non KV cache memory: 11.47GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 2.07GiB; weights memory: 7.51GiB. +(Worker_TP0 pid=444) INFO 04-22 01:36:26 [v1/worker/gpu_worker.py:436] Available KV cache memory: 63.76 GiB +(Worker_TP0 pid=444) INFO 04-22 01:36:26 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9714 to maintain the same effective KV cache size. 
+(Worker_TP0 pid=444) DEBUG 04-22 01:36:26 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 01:36:26 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) INFO 04-22 01:36:26 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,044,688 tokens +(EngineCore pid=245) INFO 04-22 01:36:26 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 127.53x +(Worker_TP0 pid=444) DEBUG 04-22 01:36:26 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=445) DEBUG 04-22 01:36:26 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=444) 2026-04-22 01:36:26,785 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP1 pid=445) 2026-04-22 01:36:26,785 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP1 pid=445) DEBUG 04-22 01:36:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:36:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) 2026-04-22 01:36:26,799 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP0 pid=444) 2026-04-22 01:36:26,800 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP0 pid=444) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=245) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=245) DEBUG 04-22 01:36:32 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. +(EngineCore pid=245) INFO 04-22 01:36:32 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP0 pid=444) DEBUG 04-22 01:36:32 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=445) DEBUG 04-22 01:36:32 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 01:36:32 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 01:36:32 [v1/engine/core.py:1158] EngineCore waiting for work. +(EngineCore pid=245) DEBUG 04-22 01:36:32 [v1/engine/core.py:1158] EngineCore waiting for work. +(APIServer pid=1) INFO 04-22 01:36:32 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] +(APIServer pid=1) WARNING 04-22 01:36:32 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 0.6, 'top_p': 0.9}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. +(APIServer pid=1) DEBUG 04-22 01:36:33 [renderers/base.py:197] Warming up chat template processing... +(APIServer pid=1) DEBUG 04-22 01:36:33 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e82621-0d095a92633056173e92f6bd;3df6e5c7-c042-4e34-b6ad-faa54bd55108) +(APIServer pid=1) DEBUG 04-22 01:36:33 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 01:36:33 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/processor_config.json. 
+(APIServer pid=1) DEBUG 04-22 01:36:33 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e82621-5f9cf38e349ad92500e3d5cf;75af7b6e-94c8-4a06-877a-3fd56f8c5edb) +(APIServer pid=1) DEBUG 04-22 01:36:33 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 01:36:33 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/preprocessor_config.json. +(Worker_TP1 pid=445) DEBUG 04-22 01:36:33 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=444) DEBUG 04-22 01:36:33 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) INFO 04-22 01:36:33 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. +(APIServer pid=1) DEBUG 04-22 01:36:33 [renderers/base.py:203] Chat template warmup completed in 0.934s +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:37] Available routes are: +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: HEAD, GET +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /docs, Methods: HEAD, GET +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: HEAD, GET +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /redoc, Methods: HEAD, GET +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /load, Methods: GET 
+(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /version, Methods: GET +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /health, Methods: GET +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: 
/v1/chat/completions/render, Methods: POST +(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST +(APIServer pid=1) INFO: Started server process [1] +(APIServer pid=1) INFO: Waiting for application startup. +(APIServer pid=1) INFO: Application startup complete. +(APIServer pid=1) DEBUG 04-22 01:36:38 [v1/engine/async_llm.py:875] Called check_health. +(APIServer pid=1) INFO: 10.129.8.2:49764 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp4pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp4pp1dp1--8192.log new file mode 100644 index 00000000..545c678c --- /dev/null +++ b/accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp4pp1dp1--8192.log @@ -0,0 +1,2768 @@ +DEBUG 04-22 01:36:52 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:36:52 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:36:52 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:36:52 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:36:52 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:36:52 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:36:52 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:36:52 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:36:52 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:36:52 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:36:52 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:36:52 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-22 01:36:57 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:36:59 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 01:36:59 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:36:59 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:36:59 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:36:59 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 01:36:59 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:36:59 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:36:59 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:36:59 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct +(APIServer pid=1) INFO 04-22 01:36:59 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:36:59 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:36:59 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 8192, 'tensor_parallel_size': 4, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:36:59 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:36:59 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:36:59 [logging_utils/log_time.py:29] Registry inspect model class: 
Elapsed time 0.0004390 secs +(APIServer pid=1) INFO 04-22 01:36:59 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 01:36:59 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 01:36:59 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:36:59 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:36:59 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:36:59 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:36:59 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:36:59 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 01:36:59 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:36:59 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) INFO 04-22 01:37:01 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 01:37:01 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. 
+(APIServer pid=1) DEBUG 04-22 01:37:01 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:37:02 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:37:02 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:37:05 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:37:05 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:37:05 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:37:05 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:37:05 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:37:05 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:37:05 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:37:05 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:37:05 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:37:05 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:37:05 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:37:05 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:37:10 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=244) DEBUG 04-22 01:37:11 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:37:11 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=244) DEBUG 04-22 01:37:11 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/47561f57-e3b4-4886-86f4-b8d671d8d58a'], outputs=['ipc:///tmp/f27c5513-b1bf-46b8-9e3d-d88f9838a989'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=244) DEBUG 04-22 01:37:11 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=244) DEBUG 04-22 01:37:11 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=244) DEBUG 04-22 01:37:11 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=244) DEBUG 04-22 01:37:11 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=244) DEBUG 04-22 01:37:11 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=244) INFO 04-22 01:37:11 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [256, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=244) WARNING 04-22 01:37:11 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=244) INFO 04-22 01:37:11 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.3.123 (local), world_size=4, local_world_size=4 +(EngineCore pid=244) DEBUG 04-22 01:37:11 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/cb2ab28f-0990-478b-8791-16db02f501c6 +(EngineCore pid=244) DEBUG 04-22 01:37:11 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_ba73b0bc'), local_subscribe_addr='ipc:///tmp/cb2ab28f-0990-478b-8791-16db02f501c6', local_notify_addr='ipc:///tmp/dd6de613-afe4-4559-8d5e-5ee043f1a2c4', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 01:37:15 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:37:15 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:37:15 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:37:15 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:37:15 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:37:15 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:37:15 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:37:15 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:37:15 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-22 01:37:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:37:15 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:37:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:37:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:37:15 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
+DEBUG 04-22 01:37:15 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:37:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:37:15 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:37:15 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:37:15 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:37:15 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:37:15 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:37:20 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:37:20 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:37:20 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 01:37:20 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:37:21 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:37:21 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:37:21 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:37:21 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 01:37:21 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:37:21 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:37:21 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:37:21 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) DEBUG 04-22 01:37:21 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +DEBUG 04-22 01:37:22 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:37:22 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:37:22 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:37:22 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+DEBUG 04-22 01:37:22 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:37:22 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:37:22 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:37:22 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(Worker pid=446) DEBUG 04-22 01:37:22 [distributed/parallel_state.py:1356] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:36977 backend=nccl +(Worker pid=446) INFO 04-22 01:37:22 [distributed/parallel_state.py:1400] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:36977 backend=nccl +(Worker pid=443) DEBUG 04-22 01:37:22 [distributed/parallel_state.py:1356] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:36977 backend=nccl +(Worker pid=443) INFO 04-22 01:37:22 [distributed/parallel_state.py:1400] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:36977 backend=nccl +(Worker pid=444) DEBUG 04-22 01:37:22 [distributed/parallel_state.py:1356] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:36977 backend=nccl +(Worker pid=444) INFO 04-22 01:37:22 [distributed/parallel_state.py:1400] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:36977 backend=nccl +(Worker pid=445) DEBUG 04-22 01:37:23 [distributed/parallel_state.py:1356] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:36977 backend=nccl +(Worker pid=445) INFO 04-22 01:37:23 [distributed/parallel_state.py:1400] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:36977 backend=nccl +[Gloo] Rank 0 is connected to 3 peer ranks. 
Expected number of connected peer ranks is : 3 +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=443) DEBUG 04-22 01:37:23 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=445) DEBUG 04-22 01:37:23 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=444) DEBUG 04-22 01:37:23 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=446) DEBUG 04-22 01:37:23 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=446) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
+(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=446) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=443) DEBUG 04-22 01:37:24 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=443) INFO 04-22 01:37:24 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=446) DEBUG 04-22 01:37:27 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=443) DEBUG 04-22 01:37:27 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=444) DEBUG 04-22 01:37:27 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=445) DEBUG 04-22 01:37:27 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=443) DEBUG 04-22 01:37:27 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/bae3bdc8-aa09-47ea-b15a-66559cba742a +(Worker pid=443) DEBUG 04-22 01:37:27 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3], buffer_handle=(3, 4194304, 6, 'psm_91c73355'), local_subscribe_addr='ipc:///tmp/bae3bdc8-aa09-47ea-b15a-66559cba742a', local_notify_addr='ipc:///tmp/cf6d7708-31bf-4b03-aabf-be42f5d638ec', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=444) DEBUG 04-22 01:37:27 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/bae3bdc8-aa09-47ea-b15a-66559cba742a +(Worker pid=446) DEBUG 04-22 01:37:27 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/bae3bdc8-aa09-47ea-b15a-66559cba742a +(Worker pid=445) DEBUG 04-22 01:37:27 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/bae3bdc8-aa09-47ea-b15a-66559cba742a +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(Worker pid=443) INFO 04-22 01:37:27 [distributed/parallel_state.py:1716] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(Worker pid=443) DEBUG 04-22 01:37:28 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776821848.1358237, auto_measure=True +(Worker pid=443) DEBUG 04-22 01:37:28 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=445) DEBUG 04-22 01:37:28 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776821848.20137, auto_measure=True +(Worker pid=445) DEBUG 04-22 01:37:28 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=443) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 
'input_embeds'] +(Worker pid=446) DEBUG 04-22 01:37:28 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776821848.2335331, auto_measure=True +(Worker pid=446) DEBUG 04-22 01:37:28 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=443) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:37:28 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776821848.248634, auto_measure=True +(Worker pid=444) DEBUG 04-22 01:37:28 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=443) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=443) DEBUG 04-22 01:37:28 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=443) DEBUG 04-22 01:37:28 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). 
+(Worker pid=445) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=445) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=445) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 01:37:28 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_TP0 pid=443) INFO 04-22 01:37:28 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... +(Worker pid=446) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=446) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=445) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=446) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=445) DEBUG 04-22 01:37:28 
[v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=444) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=446) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=446) DEBUG 04-22 01:37:28 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=444) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=444) DEBUG 04-22 01:37:28 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker_TP0 pid=443) DEBUG 04-22 01:37:28 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_TP0 pid=443) INFO 04-22 01:37:28 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(Worker_TP0 pid=443) INFO 04-22 01:37:28 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP0 pid=443) DEBUG 04-22 01:37:28 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=443) DEBUG 04-22 01:37:28 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=443) DEBUG 04-22 01:37:28 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=443) DEBUG 04-22 01:37:28 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP2 pid=445) DEBUG 04-22 01:37:28 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP2 pid=445) DEBUG 04-22 01:37:28 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP2 pid=445) DEBUG 04-22 01:37:28 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP2 pid=445) DEBUG 04-22 01:37:28 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP3 pid=446) DEBUG 04-22 01:37:28 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP3 pid=446) DEBUG 04-22 01:37:28 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP3 pid=446) DEBUG 04-22 01:37:28 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP3 pid=446) DEBUG 04-22 01:37:28 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP1 pid=444) DEBUG 04-22 01:37:28 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP1 pid=444) DEBUG 04-22 01:37:28 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP1 pid=444) DEBUG 04-22 01:37:28 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=444) DEBUG 04-22 01:37:28 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP0 pid=443) DEBUG 04-22 01:37:29 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors']] +(Worker_TP2 pid=445) DEBUG 04-22 01:37:29 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors']] +(Worker_TP0 pid=443) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(Worker_TP2 pid=445) DEBUG 04-22 01:37:39 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=445) DEBUG 04-22 01:37:39 [compilation/decorators.py:528] Start compiling function +(Worker_TP3 pid=446) DEBUG 04-22 01:37:39 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:37:39 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=446) DEBUG 04-22 01:37:39 [compilation/decorators.py:528] Start compiling function +(Worker_TP1 pid=444) DEBUG 04-22 01:37:39 [compilation/decorators.py:528] Start compiling function +(EngineCore pid=244) DEBUG 04-22 01:37:40 [distributed/device_communicators/shm_broadcast.py:191] Poller 
received notify event +(APIServer pid=1) DEBUG 04-22 01:37:42 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP0 pid=443) INFO 04-22 01:37:44 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone for vLLM's torch.compile +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=3f19a9810e comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 
01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 
512, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP0 
pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] Vllm config hash: 3f19a9810e +(Worker_TP0 pid=443) INFO 04-22 01:37:44 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.45 s +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 2 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 256 +(Worker_TP0 pid=443) INFO 04-22 01:37:44 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=3f19a9810e comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_3_0/backbone +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP3 
pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', 
+(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, 
+(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP3 pid=446) DEBUG 04-22 
01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP3 pid=446) 
DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP3 pid=446) DEBUG 
04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] Vllm config hash: 3f19a9810e +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 
[compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=3f19a9810e comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_1_0/backbone +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_API_KEY': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=3f19a9810e comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_2_0/backbone +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, 
+(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, 
+(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP1 pid=444) DEBUG 04-22 
01:37:44 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, 
+(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_SYSTEM_START_DATE': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP1 pid=444) 
DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] Vllm config hash: 3f19a9810e +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP2 pid=445) 
DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP2 
pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP2 pid=445) 
DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP2 
pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': 
False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, 
+(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] Vllm config hash: 3f19a9810e +(Worker_TP0 pid=443) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. 
+(Worker_TP0 pid=443) return func(*args, **kwargs) +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=0, max_token_num=256, hidden_dim=4096, dtype=torch.bfloat16 +(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=1, max_token_num=256, hidden_dim=4096, dtype=torch.bfloat16 +(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=2, max_token_num=256, hidden_dim=4096, dtype=torch.bfloat16 +(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=3, max_token_num=256, hidden_dim=4096, dtype=torch.bfloat16 +(Worker_TP0 pid=443) INFO 04-22 01:37:44 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 256), (257, 8192)] +(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(Worker_TP0 pid=443) DEBUG 04-22 01:37:45 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=443) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:45 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=444) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms 
+(Worker_TP3 pid=446) DEBUG 04-22 01:37:45 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=446) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:45 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP0 pid=443) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 24.0 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:45 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP2 pid=445) DEBUG 04-22 01:37:45 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=443) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:45 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP1 pid=444) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 25.1 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:45 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP1 pid=444) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:45 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP3 pid=446) DEBUG 04-22 01:37:45 
[compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 25.2 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:45 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP3 pid=446) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:45 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP2 pid=445) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 27.2 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:45 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP2 pid=445) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=443) INFO 04-22 01:37:47 [compilation/backends.py:372] Cache the graph of compile range (1, 256) for later use +(Worker_TP0 pid=443) DEBUG 04-22 01:37:47 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 256) from inductor_standalone via handle ('artifact_compile_range_1_256_subgraph_0', '/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone/artifact_compile_range_1_256_subgraph_0') +(Worker_TP0 pid=443) DEBUG 04-22 01:37:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=443) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:48 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) +(Worker_TP0 pid=443) DEBUG 04-22 01:37:48 
[compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.6 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=443) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=446) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:48 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) +(Worker_TP3 pid=446) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=446) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=444) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:48 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) +(Worker_TP1 pid=444) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=444) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:48 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=445) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:48 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) +(Worker_TP2 pid=445) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=445) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=443) INFO 04-22 01:37:49 [compilation/backends.py:372] Cache the graph of compile range (257, 8192) for later use +(Worker_TP0 pid=443) DEBUG 04-22 01:37:49 [compilation/backends.py:377] Store the 0-th graph for compile range(257, 8192) from inductor_standalone via handle ('artifact_compile_range_257_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone/artifact_compile_range_257_8192_subgraph_0') +(Worker_TP0 pid=443) DEBUG 04-22 01:37:49 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=443) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:49 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=443) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 36.5 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:49 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=443) DEBUG 04-22 
01:37:49 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:49 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=444) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:49 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=446) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:49 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=444) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 37.2 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:49 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=444) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:49 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP3 pid=446) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 37.9 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:49 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP3 pid=446) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:49 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=445) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:49 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP2 pid=445) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.7 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:49 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP2 pid=445) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:50 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 256) from inductor_standalone via handle ('artifact_compile_range_1_256_subgraph_1', '/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone/artifact_compile_range_1_256_subgraph_1') +(Worker_TP0 pid=443) DEBUG 04-22 01:37:50 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=443) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:50 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) +(Worker_TP0 pid=443) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:50 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=443) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:50 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=446) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:50 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) +(Worker_TP3 pid=446) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:50 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=446) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:50 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=444) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:50 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) +(Worker_TP1 pid=444) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:50 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=444) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:51 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=445) DEBUG 04-22 01:37:51 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:51 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) +(Worker_TP2 pid=445) DEBUG 04-22 01:37:51 [compilation/passes/vllm_inductor_pass.py:79] 
PostCleanupPass completed in 0.3 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:51 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=445) DEBUG 04-22 01:37:51 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(APIServer pid=1) DEBUG 04-22 01:37:52 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=443) DEBUG 04-22 01:37:52 [compilation/backends.py:377] Store the 1-th graph for compile range(257, 8192) from inductor_standalone via handle ('artifact_compile_range_257_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone/artifact_compile_range_257_8192_subgraph_1') +(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 37.5 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 256) from inductor_standalone via handle ('artifact_compile_range_1_256_subgraph_32', '/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone/artifact_compile_range_1_256_subgraph_32') +(Worker_TP0 pid=443) INFO 
04-22 01:37:54 [compilation/backends.py:390] Compiling a graph for compile range (1, 256) takes 6.41 s +(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.0 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 40.4 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) +(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.7 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/passes/pass_manager.py:100] Skipping with 
compile range (257, 8192) +(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) +(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) +(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms 
+(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/backends.py:377] Store the 32-th graph for compile range(257, 8192) from inductor_standalone via handle ('artifact_compile_range_257_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone/artifact_compile_range_257_8192_subgraph_32') +(Worker_TP0 pid=443) INFO 04-22 01:37:54 [compilation/backends.py:390] Compiling a graph for compile range (257, 8192) takes 7.03 s +(Worker_TP0 pid=443) DEBUG 04-22 01:37:55 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone/computation_graph.py +(Worker_TP0 pid=443) INFO 04-22 01:37:56 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/cadcd07743782ae93f16f13d5b370fa694cecf280786fe0de34a40c069d9e40d/rank_0_0/model +(Worker_TP0 pid=443) INFO 04-22 01:37:56 [compilation/monitor.py:48] torch.compile took 16.38 s in total +(Worker_TP0 pid=443) INFO 04-22 01:37:56 [compilation/monitor.py:76] Initial profiling/warmup run took 0.71 s +(APIServer pid=1) DEBUG 04-22 01:38:02 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP0 pid=443) INFO 04-22 01:38:02 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP0 pid=443) INFO 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP1 pid=444) INFO 04-22 01:38:02 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP1 pid=444) INFO 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP3 pid=446) INFO 04-22 01:38:02 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP3 pid=446) INFO 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP2 pid=445) INFO 04-22 01:38:02 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP2 pid=445) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP2 pid=445) INFO 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 
01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 84.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 70.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(Worker_TP0 pid=443) INFO 04-22 01:38:02 
[distributed/device_communicators/custom_all_reduce.py:216] Registering 260 cuda graph addresses +(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 84.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] 
Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=445) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 70.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(Worker_TP1 pid=444) INFO 04-22 01:38:02 [distributed/device_communicators/custom_all_reduce.py:216] Registering 260 cuda graph addresses +(Worker_TP2 pid=445) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=445) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 84.00 MiB first-capture + (51-1) × 6.00 MiB per-graph 
+(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=445) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=445) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=445) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=446) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=446) DEBUG 04-22 01:38:03 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 84.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=446) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 70.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(Worker_TP3 pid=446) INFO 04-22 01:38:03 [distributed/device_communicators/custom_all_reduce.py:216] Registering 260 cuda graph addresses 
+(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 70.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(Worker_TP2 pid=445) INFO 04-22 01:38:03 [distributed/device_communicators/custom_all_reduce.py:216] Registering 260 cuda graph addresses +(Worker_TP0 pid=443) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP0 pid=443) INFO 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.57 GiB total +(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP2 pid=445) INFO 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.57 GiB total +(Worker_TP1 pid=444) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP1 pid=444) INFO 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.57 GiB total +(Worker_TP3 pid=446) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP3 pid=446) INFO 
04-22 01:38:03 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.57 GiB total +(Worker_TP0 pid=443) DEBUG 04-22 01:38:03 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP0 pid=443) DEBUG 04-22 01:38:03 [v1/worker/gpu_worker.py:430] Free memory after profiling: 71.27 GiB (total), 68.87 GiB (within requested) +(Worker_TP0 pid=443) DEBUG 04-22 01:38:03 [v1/worker/gpu_worker.py:435] Memory profiling takes 23.99 seconds. Total non KV cache memory: 7.79GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 2.13GiB; weights memory: 3.77GiB. +(Worker_TP0 pid=443) INFO 04-22 01:38:03 [v1/worker/gpu_worker.py:436] Available KV cache memory: 67.44 GiB +(Worker_TP0 pid=443) INFO 04-22 01:38:03 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9572 to maintain the same effective KV cache size. +(Worker_TP0 pid=443) DEBUG 04-22 01:38:03 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 01:38:03 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 01:38:03 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=445) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP2 pid=445) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:430] Free memory after profiling: 71.27 GiB (total), 68.87 GiB (within requested) +(Worker_TP2 pid=445) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:435] Memory profiling takes 24.21 seconds. 
Total non KV cache memory: 7.79GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 2.13GiB; weights memory: 3.77GiB. +(Worker_TP2 pid=445) INFO 04-22 01:38:04 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9572 to maintain the same effective KV cache size. +(Worker_TP2 pid=445) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=444) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP1 pid=444) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:430] Free memory after profiling: 71.27 GiB (total), 68.87 GiB (within requested) +(Worker_TP1 pid=444) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:435] Memory profiling takes 24.20 seconds. Total non KV cache memory: 7.79GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 2.13GiB; weights memory: 3.77GiB. +(Worker_TP1 pid=444) INFO 04-22 01:38:04 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9572 to maintain the same effective KV cache size. 
+(Worker_TP1 pid=444) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=446) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP3 pid=446) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:430] Free memory after profiling: 71.27 GiB (total), 68.87 GiB (within requested) +(Worker_TP3 pid=446) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:435] Memory profiling takes 24.21 seconds. Total non KV cache memory: 7.79GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 2.13GiB; weights memory: 3.77GiB. +(Worker_TP3 pid=446) INFO 04-22 01:38:04 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9572 to maintain the same effective KV cache size. 
+(Worker_TP3 pid=446) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) INFO 04-22 01:38:04 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 2,209,792 tokens +(EngineCore pid=244) INFO 04-22 01:38:04 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 269.75x +(Worker_TP0 pid=443) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=445) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=444) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=446) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=446) 2026-04-22 01:38:04,195 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP1 pid=444) 2026-04-22 01:38:04,195 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP2 pid=445) 2026-04-22 01:38:04,195 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP0 pid=443) 2026-04-22 01:38:04,196 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(Worker_TP1 pid=444) DEBUG 04-22 01:38:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:38:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=446) DEBUG 04-22 01:38:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=445) DEBUG 04-22 01:38:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) 2026-04-22 01:38:04,212 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP1 pid=444) 2026-04-22 01:38:04,212 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP3 pid=446) 2026-04-22 01:38:04,212 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP2 pid=445) 2026-04-22 01:38:04,213 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP0 pid=443) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=244) INFO 04-22 01:38:10 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP2 pid=445) DEBUG 04-22 01:38:10 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=444) DEBUG 04-22 01:38:10 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=446) DEBUG 04-22 01:38:10 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=443) DEBUG 04-22 01:38:10 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 01:38:10 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 01:38:10 [v1/engine/core.py:1158] EngineCore waiting for work. +(EngineCore pid=244) DEBUG 04-22 01:38:10 [v1/engine/core.py:1158] EngineCore waiting for work. +(APIServer pid=1) INFO 04-22 01:38:10 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] +(APIServer pid=1) WARNING 04-22 01:38:10 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 0.6, 'top_p': 0.9}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. +(APIServer pid=1) DEBUG 04-22 01:38:10 [renderers/base.py:197] Warming up chat template processing... +(APIServer pid=1) DEBUG 04-22 01:38:10 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e82682-32adea411cb375e87c01dc55;358a11b2-1448-46f2-9841-f4af33f585c7) +(APIServer pid=1) DEBUG 04-22 01:38:10 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 01:38:10 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/processor_config.json. 
+(APIServer pid=1) DEBUG 04-22 01:38:10 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e82682-417d35914b7603f52c092bee;5cc6a4a0-42a1-4a0f-ab93-1161ee564144) +(APIServer pid=1) DEBUG 04-22 01:38:10 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 01:38:10 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/preprocessor_config.json. +(Worker_TP2 pid=445) DEBUG 04-22 01:38:11 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=446) DEBUG 04-22 01:38:11 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=444) DEBUG 04-22 01:38:11 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=443) DEBUG 04-22 01:38:11 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) INFO 04-22 01:38:11 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. 
+(APIServer pid=1) DEBUG 04-22 01:38:11 [renderers/base.py:203] Chat template warmup completed in 0.842s +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:37] Available routes are: +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: HEAD, GET +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /docs, Methods: HEAD, GET +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: HEAD, GET +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /redoc, Methods: HEAD, GET +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /load, Methods: GET +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /version, Methods: GET +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /health, Methods: GET +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST +(APIServer pid=1) INFO 04-22 01:38:11 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST +(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST +(APIServer pid=1) INFO: Started server process [1] +(APIServer pid=1) INFO: Waiting for application startup. +(APIServer pid=1) INFO: Application startup complete. +(APIServer pid=1) DEBUG 04-22 01:38:16 [v1/engine/async_llm.py:875] Called check_health. +(APIServer pid=1) INFO: 10.131.2.2:51242 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--32768.log b/accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--32768.log new file mode 100644 index 00000000..4bfea5d3 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--32768.log @@ -0,0 +1,747 @@ +DEBUG 04-22 01:48:08 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
+DEBUG 04-22 01:48:08 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:48:08 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:48:08 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:48:08 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:48:08 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:48:08 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:48:08 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:48:08 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:48:08 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:48:08 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:48:08 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:48:13 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:48:15 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 01:48:15 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:48:15 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:48:15 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:48:15 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:48:15 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:48:15 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:48:15 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:48:15 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct +(APIServer pid=1) INFO 04-22 01:48:15 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:48:15 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:48:15 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 32768, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:48:15 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:48:15 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:48:15 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003805 secs +(APIServer pid=1) INFO 04-22 01:48:15 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 01:48:15 [config/model.py:1678] Using max model len 32768 +(APIServer pid=1) DEBUG 04-22 01:48:15 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:48:15 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:48:15 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:48:15 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:48:15 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) INFO 04-22 01:48:15 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:48:15 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 01:48:16 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:48:16 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:48:16 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:48:16 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:48:20 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:48:20 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:48:20 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:48:20 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:48:20 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:48:20 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:48:20 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:48:20 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:48:20 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:48:20 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:48:20 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:48:20 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:48:24 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+(EngineCore pid=243) DEBUG 04-22 01:48:26 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:48:26 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=243) DEBUG 04-22 01:48:26 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/f4ac1597-2d59-40bb-8ed5-6684c2a3c2e8'], outputs=['ipc:///tmp/8aef749a-2647-48a4-b1b3-6af061f134a4'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 01:48:26 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 01:48:26 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 01:48:26 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 01:48:26 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 01:48:26 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=243) INFO 04-22 01:48:26 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) DEBUG 04-22 01:48:27 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.38:51655 backend=nccl +(EngineCore pid=243) INFO 04-22 01:48:27 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.38:51655 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) DEBUG 04-22 01:48:27 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) INFO 04-22 01:48:27 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=243) DEBUG 04-22 01:48:27 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822507.546496, auto_measure=True +(EngineCore pid=243) DEBUG 04-22 01:48:27 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=243) DEBUG 04-22 01:48:27 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:48:27 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:48:27 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:48:27 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=243) DEBUG 04-22 01:48:27 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=243) DEBUG 04-22 01:48:27 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=243) DEBUG 04-22 01:48:27 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=243) INFO 04-22 01:48:27 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... +(EngineCore pid=243) DEBUG 04-22 01:48:28 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=243) INFO 04-22 01:48:28 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=243) INFO 04-22 01:48:28 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=243) DEBUG 04-22 01:48:28 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=243) DEBUG 04-22 01:48:28 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=243) DEBUG 04-22 01:48:28 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=243) DEBUG 04-22 01:48:28 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=243) DEBUG 04-22 01:48:28 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors']] +(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(APIServer pid=1) DEBUG 04-22 01:48:36 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=243) INFO 04-22 01:48:40 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/0b188b6917/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=b9b4658ca5 comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/0b188b6917/rank_0_0/backbone +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 01:48:40 
[compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
+(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=243) DEBUG 04-22 
01:48:40 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=243) 
DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=243) DEBUG 04-22 01:48:40 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=243) DEBUG 04-22 01:48:40 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=243) DEBUG 04-22 
01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore 
pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore 
pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] Vllm config hash: b9b4658ca5 +(EngineCore pid=243) INFO 04-22 01:48:40 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.57 s +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=243) DEBUG 04-22 01:48:41 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:48:41 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 01:48:41 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms +(EngineCore pid=243) DEBUG 04-22 01:48:41 [compilation/.../utility/fix_functionalization.py:203] 
De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:48:41 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) INFO 04-22 01:48:42 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=243) DEBUG 04-22 01:48:42 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/0b188b6917/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=243) DEBUG 04-22 01:48:43 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:48:43 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=243) DEBUG 04-22 01:48:43 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=243) DEBUG 04-22 01:48:43 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:48:43 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) DEBUG 04-22 01:48:44 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/0b188b6917/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=243) DEBUG 04-22 01:48:45 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:48:45 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(EngineCore pid=243) DEBUG 04-22 01:48:45 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=243) 
DEBUG 04-22 01:48:45 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:48:45 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 01:48:45 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/0b188b6917/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') +(EngineCore pid=243) INFO 04-22 01:48:45 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.13 s +(EngineCore pid=243) DEBUG 04-22 01:48:45 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/0b188b6917/rank_0_0/backbone/computation_graph.py +(APIServer pid=1) DEBUG 04-22 01:48:46 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) INFO 04-22 01:48:46 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/eb43777f3ad942d09a4a824317e86653cb53f8c77161fefd5eaae5f110b38a0a/rank_0_0/model +(EngineCore pid=243) INFO 04-22 01:48:46 [compilation/monitor.py:48] torch.compile took 11.04 s in total +(EngineCore pid=243) INFO 04-22 01:48:47 [compilation/monitor.py:76] Initial profiling/warmup run took 0.45 s +(EngineCore pid=243) INFO 04-22 01:48:52 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=243) DEBUG 04-22 01:48:52 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=243) INFO 04-22 01:48:52 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=243) DEBUG 04-22 01:48:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:48:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:48:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:48:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 138.00 MiB first-capture + (51-1) × 6.00 MiB per-graph 
+(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:48:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:48:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 260.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=243) INFO 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total +(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.24 GiB (total), 58.79 GiB (within requested) +(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.98 seconds. Total non KV cache memory: 17.12GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 14.99GiB. 
+(EngineCore pid=243) INFO 04-22 01:48:53 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.11 GiB +(EngineCore pid=243) INFO 04-22 01:48:53 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. +(EngineCore pid=243) INFO 04-22 01:48:53 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 476,016 tokens +(EngineCore pid=243) INFO 04-22 01:48:53 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 32,768 tokens per request: 14.53x +(EngineCore pid=243) 2026-04-22 01:48:53,875 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) 2026-04-22 01:48:53,884 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:52:29 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:52:29 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:52:29 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:52:29 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:52:29 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:52:29 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model microsoft/phi-4 +(APIServer pid=1) INFO 04-22 00:52:29 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:52:29 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:52:29 [entrypoints/utils.py:233] non-default args: {'model_tag': 'microsoft/phi-4', 'model': 'microsoft/phi-4', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:52:29 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:52:30 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.phi3.Phi3ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:52:30 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0007826 secs +(APIServer pid=1) INFO 04-22 00:52:30 [config/model.py:549] Resolved architecture: Phi3ForCausalLM +(APIServer pid=1) INFO 04-22 00:52:30 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:52:30 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:52:30 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:52:30 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:52:30 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:52:30 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) INFO 04-22 00:52:30 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:52:30 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:52:30 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:52:30 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:52:30 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:52:30 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:52:34 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:52:34 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:52:34 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:52:34 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:52:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:52:34 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:52:34 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:52:34 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:52:34 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:52:34 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:52:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:52:34 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:52:39 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+(APIServer pid=1) DEBUG 04-22 00:52:40 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. +(EngineCore pid=244) DEBUG 04-22 00:52:40 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:52:40 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=244) DEBUG 04-22 00:52:40 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/12badbc3-9925-4219-a699-30fb51bff4ce'], outputs=['ipc:///tmp/bc7aadab-45d1-4425-969a-6f2a0352f1ba'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=244) DEBUG 04-22 00:52:40 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=244) DEBUG 04-22 00:52:40 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=244) DEBUG 04-22 00:52:40 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=244) DEBUG 04-22 00:52:40 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=244) DEBUG 04-22 00:52:40 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=244) INFO 04-22 00:52:40 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='microsoft/phi-4', speculative_config=None, tokenizer='microsoft/phi-4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=microsoft/phi-4, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=244) WARNING 04-22 00:52:40 [platforms/interface.py:525] Using 'pin_memory=False' as WSL is detected. This may slow down the performance. +(EngineCore pid=244) DEBUG 04-22 00:52:41 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.82:45741 backend=nccl +(EngineCore pid=244) INFO 04-22 00:52:41 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.82:45741 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(EngineCore pid=244) DEBUG 04-22 00:52:41 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) INFO 04-22 00:52:41 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=244) DEBUG 04-22 00:52:41 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776819161.606759, auto_measure=True +(EngineCore pid=244) DEBUG 04-22 00:52:41 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=244) DEBUG 04-22 00:52:41 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:52:41 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:52:41 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:52:41 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=244) 
DEBUG 04-22 00:52:41 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(EngineCore pid=244) DEBUG 04-22 00:52:41 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=244) DEBUG 04-22 00:52:41 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=244) INFO 04-22 00:52:41 [v1/worker/gpu_model_runner.py:4735] Starting to load model microsoft/phi-4... +(EngineCore pid=244) DEBUG 04-22 00:52:42 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=244) INFO 04-22 00:52:42 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=244) INFO 04-22 00:52:42 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=244) DEBUG 04-22 00:52:42 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=244) DEBUG 04-22 00:52:42 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=244) DEBUG 04-22 00:52:42 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 81, 'silu_and_mul': 40, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=244) DEBUG 04-22 00:52:42 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=244) DEBUG 04-22 00:52:42 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00005-of-00006.safetensors', 'model-00004-of-00006.safetensors', 'model-00002-of-00006.safetensors', 'model-00006-of-00006.safetensors', 'model-00001-of-00006.safetensors', 'model-00003-of-00006.safetensors']] +(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/6 [00:00 +(APIServer pid=1) DEBUG 04-22 00:53:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=244) INFO 04-22 00:53:12 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/2556e36a7d/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=c01d08ff9f comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/2556e36a7d/rank_0_0/backbone +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-22 00:53:12 
[compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
+(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=244) DEBUG 04-22 
00:53:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=244) 
DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=244) DEBUG 04-22 00:53:12 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=244) DEBUG 04-22 00:53:12 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=244) DEBUG 04-22 
00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore 
pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore 
pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] Vllm config hash: c01d08ff9f +(EngineCore pid=244) INFO 04-22 00:53:12 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.97 s +(EngineCore pid=244) DEBUG 04-22 00:53:13 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=244) DEBUG 04-22 00:53:13 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=244) DEBUG 04-22 00:53:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:53:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=244) DEBUG 04-22 00:53:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=244) DEBUG 04-22 00:53:13 [compilation/.../utility/fix_functionalization.py:203] 
De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:53:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=244) INFO 04-22 00:53:15 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=244) DEBUG 04-22 00:53:15 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/2556e36a7d/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=244) DEBUG 04-22 00:53:15 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:53:15 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=244) DEBUG 04-22 00:53:15 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=244) DEBUG 04-22 00:53:15 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:53:15 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=244) DEBUG 04-22 00:53:17 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/2556e36a7d/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=244) DEBUG 04-22 00:53:18 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:53:18 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(EngineCore pid=244) DEBUG 04-22 00:53:18 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=244) 
DEBUG 04-22 00:53:18 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:53:18 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=244) DEBUG 04-22 00:53:19 [compilation/backends.py:377] Store the 40-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_40', '/data/.cache/vllm/torch_compile_cache/2556e36a7d/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_40') +(EngineCore pid=244) INFO 04-22 00:53:19 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.27 s +(EngineCore pid=244) DEBUG 04-22 00:53:19 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/2556e36a7d/rank_0_0/backbone/computation_graph.py +(EngineCore pid=244) INFO 04-22 00:53:20 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/e77a412f263c8a32560bf00ea90369dc5c974de159e6eb0e5eb243025ca9ebe4/rank_0_0/model +(EngineCore pid=244) INFO 04-22 00:53:20 [compilation/monitor.py:48] torch.compile took 12.51 s in total +(APIServer pid=1) DEBUG 04-22 00:53:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=244) INFO 04-22 00:53:21 [compilation/monitor.py:76] Initial profiling/warmup run took 0.70 s +(EngineCore pid=244) INFO 04-22 00:53:26 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=244) DEBUG 04-22 00:53:26 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=244) INFO 04-22 00:53:26 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=244) DEBUG 04-22 00:53:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:53:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:53:26 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:53:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:53:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:53:26 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:53:26 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 156.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-22 00:53:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:53:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on 
(FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:53:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 328.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=244) INFO 04-22 00:53:27 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.81 GiB total +(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_worker.py:430] Free memory after profiling: 50.78 GiB (total), 47.33 GiB (within requested) +(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_worker.py:435] Memory profiling takes 19.87 seconds. Total non KV cache memory: 29.15GiB; torch peak memory increase: 1.52GiB; non-torch forward increase memory: 0.25GiB; weights memory: 27.39GiB. +(EngineCore pid=244) INFO 04-22 00:53:27 [v1/worker/gpu_worker.py:436] Available KV cache memory: 46.08 GiB +(EngineCore pid=244) INFO 04-22 00:53:27 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. 
To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9602 to maintain the same effective KV cache size. +(EngineCore pid=244) INFO 04-22 00:53:27 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 241,568 tokens +(EngineCore pid=244) INFO 04-22 00:53:27 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 29.49x +(EngineCore pid=244) 2026-04-22 00:53:27,859 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) 2026-04-22 00:53:27,872 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:53:55 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:53:55 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:53:55 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:53:55 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:53:55 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:53:55 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model mistralai/Mistral-Small-3.1-24B-Instruct-2503 +(APIServer pid=1) INFO 04-22 00:53:55 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:53:55 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:53:55 [entrypoints/utils.py:233] non-default args: {'model_tag': 'mistralai/Mistral-Small-3.1-24B-Instruct-2503', 'model': 'mistralai/Mistral-Small-3.1-24B-Instruct-2503', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:53:55 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) INFO 04-22 00:53:55 [transformers_utils/config.py:288] Inferred from consolidated*.safetensors files torch.bfloat16 dtype. 
+(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] Initialized config PretrainedConfig { +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "architectures": [ +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "PixtralForConditionalGeneration" +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] ], +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "dtype": "bfloat16", +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "image_token_index": 10, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "multimodal_projector_bias": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "projector_hidden_act": "gelu", +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "spatial_merge_size": 2, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "text_config": { +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "_name_or_path": "", +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "add_cross_attention": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "architectures": [ +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "MistralForCausalLM" +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] ], +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "bad_words_ids": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "begin_suppress_tokens": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "bos_token_id": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] 
"chunk_size_feed_forward": 0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "cross_attention_hidden_size": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "decoder_start_token_id": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "diversity_penalty": 0.0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "do_sample": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "dtype": "bfloat16", +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "early_stopping": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "encoder_no_repeat_ngram_size": 0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "eos_token_id": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "exponential_decay_length_penalty": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "finetuning_task": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "forced_bos_token_id": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "forced_eos_token_id": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "head_dim": 128, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "hidden_act": "silu", +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "hidden_size": 5120, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "id2label": { +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "0": "LABEL_0", +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "1": "LABEL_1" +(APIServer pid=1) DEBUG 
04-22 00:53:55 [transformers_utils/configs/mistral.py:91] }, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "intermediate_size": 32768, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "is_decoder": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "is_encoder_decoder": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "label2id": { +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "LABEL_0": 0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "LABEL_1": 1 +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] }, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "length_penalty": 1.0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "max_length": 20, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "max_position_embeddings": 131072, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "max_seq_len": 131072, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "min_length": 0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "model_type": "", +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "no_repeat_ngram_size": 0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_attention_heads": 32, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_beam_groups": 1, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_beams": 1, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_hidden_layers": 40, +(APIServer pid=1) DEBUG 04-22 00:53:55 
[transformers_utils/configs/mistral.py:91] "num_key_value_heads": 8, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_return_sequences": 1, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "output_attentions": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "output_hidden_states": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "output_scores": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "pad_token_id": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "prefix": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "problem_type": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "pruned_heads": {}, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "remove_invalid_values": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "repetition_penalty": 1.0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "return_dict": true, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "return_dict_in_generate": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "rms_norm_eps": 1e-05, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "rope_theta": 1000000000.0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "sep_token_id": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "suppress_tokens": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "task_specific_params": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] 
"temperature": 1.0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "tf_legacy_loss": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "tie_encoder_decoder": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "tie_word_embeddings": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "tokenizer_class": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "top_k": 50, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "top_p": 1.0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "torchscript": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "typical_p": 1.0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "use_bfloat16": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "vocab_size": 131072 +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] }, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "transformers_version": "4.57.6", +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "vision_config": { +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "_name_or_path": "", +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "adapter_bias": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "add_cross_attention": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "add_pre_mm_projector_layer_norm": true, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "architectures": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 
[transformers_utils/configs/mistral.py:91] "bad_words_ids": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "begin_suppress_tokens": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "bos_token_id": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "chunk_size_feed_forward": 0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "cross_attention_hidden_size": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "decoder_start_token_id": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "diversity_penalty": 0.0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "do_sample": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "dtype": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "early_stopping": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "encoder_no_repeat_ngram_size": 0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "eos_token_id": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "exponential_decay_length_penalty": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "finetuning_task": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "forced_bos_token_id": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "forced_eos_token_id": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "hidden_size": 1024, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "id2label": { +(APIServer pid=1) DEBUG 04-22 00:53:55 
[transformers_utils/configs/mistral.py:91] "0": "LABEL_0", +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "1": "LABEL_1" +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] }, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "image_break_token_id": 12, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "image_end_token_id": 13, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "image_size": 1540, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "image_token_id": 10, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "intermediate_size": 4096, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "is_decoder": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "is_encoder_decoder": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "label2id": { +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "LABEL_0": 0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "LABEL_1": 1 +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] }, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "length_penalty": 1.0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "max_image_size": 1540, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "max_length": 20, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "min_length": 0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "mm_projector_id": "patch_merge", +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] 
"model_type": "", +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "no_repeat_ngram_size": 0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_attention_heads": 16, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_beam_groups": 1, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_beams": 1, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_channels": 3, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_hidden_layers": 24, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_return_sequences": 1, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "output_attentions": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "output_hidden_states": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "output_scores": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "pad_token_id": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "patch_size": 14, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "prefix": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "problem_type": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "pruned_heads": {}, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "remove_invalid_values": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "repetition_penalty": 1.0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "return_dict": true, +(APIServer pid=1) DEBUG 04-22 00:53:55 
[transformers_utils/configs/mistral.py:91] "return_dict_in_generate": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "rope_theta": 10000.0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "sep_token_id": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "spatial_merge_size": 2, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "suppress_tokens": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "task_specific_params": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "temperature": 1.0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "tf_legacy_loss": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "tie_encoder_decoder": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "tie_word_embeddings": true, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "tokenizer_class": null, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "top_k": 50, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "top_p": 1.0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "torchscript": false, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "typical_p": 1.0, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "use_bfloat16": false +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] }, +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "vision_feature_layer": -1 +(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] } +(APIServer pid=1) DEBUG 04-22 00:53:55 
[transformers_utils/configs/mistral.py:91] +(APIServer pid=1) DEBUG 04-22 00:53:56 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.pixtral.PixtralForConditionalGeneration from cache +(APIServer pid=1) DEBUG 04-22 00:53:56 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0011260 secs +(APIServer pid=1) INFO 04-22 00:53:56 [config/model.py:549] Resolved architecture: PixtralForConditionalGeneration +(APIServer pid=1) INFO 04-22 00:53:56 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:53:56 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:53:56 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:53:56 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:53:56 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:53:56 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 00:53:56 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:53:56 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:53:56 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:53:56 [tokenizers/registry.py:68] Loading MistralTokenizer for tokenizer_mode='mistral' +(APIServer pid=1) DEBUG 04-22 00:53:56 [renderers/registry.py:57] Loading MistralRenderer for renderer_mode='mistral' +(APIServer pid=1) DEBUG 04-22 00:53:57 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. 
+(APIServer pid=1) DEBUG 04-22 00:53:57 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:54:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:54:01 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:54:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:54:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:54:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:54:01 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:54:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:54:01 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:54:01 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:54:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:54:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:54:01 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:54:06 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=243) DEBUG 04-22 00:54:07 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:54:07 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=243) DEBUG 04-22 00:54:07 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/c09e0e00-f7cd-4413-841f-2dc61caf765d'], outputs=['ipc:///tmp/44f8ab81-724b-42a3-8f49-3523a3c4a6fb'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 00:54:07 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 00:54:07 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 00:54:07 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 00:54:07 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 00:54:07 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=243) INFO 04-22 00:54:07 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='mistralai/Mistral-Small-3.1-24B-Instruct-2503', speculative_config=None, tokenizer='mistralai/Mistral-Small-3.1-24B-Instruct-2503', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=mistralai/Mistral-Small-3.1-24B-Instruct-2503, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) DEBUG 04-22 00:54:08 [tokenizers/registry.py:68] Loading MistralTokenizer for tokenizer_mode='mistral' +(EngineCore pid=243) DEBUG 04-22 00:54:09 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.33:45449 backend=nccl +(EngineCore pid=243) INFO 04-22 00:54:09 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.33:45449 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(EngineCore pid=243) DEBUG 04-22 00:54:09 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) INFO 04-22 00:54:09 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=243) DEBUG 04-22 00:54:09 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776819249.5632694, auto_measure=True +(EngineCore pid=243) DEBUG 04-22 00:54:09 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=243) DEBUG 04-22 00:54:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:54:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:54:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:54:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=243) 
DEBUG 04-22 00:54:09 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(EngineCore pid=243) DEBUG 04-22 00:54:09 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=243) DEBUG 04-22 00:54:09 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. +(EngineCore pid=243) DEBUG 04-22 00:54:09 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=243) INFO 04-22 00:54:09 [v1/worker/gpu_model_runner.py:4735] Starting to load model mistralai/Mistral-Small-3.1-24B-Instruct-2503... +(EngineCore pid=243) INFO 04-22 00:54:10 [config/vllm.py:790] Asynchronous scheduling is enabled. +(EngineCore pid=243) DEBUG 04-22 00:54:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds', 't_cond'] +(EngineCore pid=243) DEBUG 04-22 00:54:10 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=243) INFO 04-22 00:54:10 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=243) INFO 04-22 00:54:10 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
+(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=243) DEBUG 04-22 00:54:10 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=243) DEBUG 04-22 00:54:10 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=243) DEBUG 04-22 00:54:10 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=243) DEBUG 04-22 00:54:10 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 81, 'silu_and_mul': 40, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=243) DEBUG 04-22 00:54:10 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=243) DEBUG 04-22 00:54:10 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 131, 'silu_and_mul': 40, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1, 'conv2d': 1}) +(EngineCore pid=243) DEBUG 04-22 00:54:10 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=243) DEBUG 04-22 00:54:10 [model_executor/model_loader/weight_utils.py:557] Using model weights format ['consolidated*.safetensors', '*.pt'] +(EngineCore pid=243) INFO 04-22 00:54:11 [model_executor/model_loader/weight_utils.py:625] No consolidated.safetensors.index.json found in remote. +(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00 +(APIServer pid=1) DEBUG 04-22 00:54:57 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=243) DEBUG 04-22 
00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/mistral.py +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=243) INFO 04-22 00:55:02 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/d521e0d6c8/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=d4f29d6b5e comp=e546579c48 code=3efb0f576ccfea9fc4cca5c687c92f9011bee8476a24c13d13c993a50f3eaf70 dir=/data/.cache/vllm/torch_compile_cache/d521e0d6c8/rank_0_0/backbone +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=243) 
DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 
[compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 
[compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 
[compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', 
+(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 
1, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=243) DEBUG 04-22 00:55:02 
[compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 
[compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, 
+(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] Vllm config hash: d4f29d6b5e +(EngineCore pid=243) INFO 04-22 00:55:02 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.85 s +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=243) DEBUG 04-22 00:55:03 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:55:03 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 00:55:03 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms +(EngineCore pid=243) DEBUG 04-22 00:55:03 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:55:03 [compilation/passes/vllm_inductor_pass.py:79] 
FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) INFO 04-22 00:55:05 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=243) DEBUG 04-22 00:55:05 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/d521e0d6c8/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=243) DEBUG 04-22 00:55:05 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:55:05 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=243) DEBUG 04-22 00:55:05 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=243) DEBUG 04-22 00:55:05 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:55:05 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 00:55:06 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/d521e0d6c8/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=243) DEBUG 04-22 00:55:07 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/d521e0d6c8/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_2') +(APIServer pid=1) DEBUG 04-22 00:55:07 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) DEBUG 04-22 00:55:08 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:55:08 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(EngineCore pid=243) DEBUG 04-22 00:55:08 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=243) DEBUG 04-22 00:55:08 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:55:08 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 00:55:09 [compilation/backends.py:377] Store the 40-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_40', '/data/.cache/vllm/torch_compile_cache/d521e0d6c8/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_40') +(EngineCore pid=243) INFO 04-22 00:55:09 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.55 s +(EngineCore pid=243) DEBUG 04-22 00:55:09 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/d521e0d6c8/rank_0_0/backbone/computation_graph.py +(EngineCore pid=243) INFO 04-22 00:55:10 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/98ced68e0a54c070de5d01fdcea3f926c3687257f6bcfb752981b49fdfe92890/rank_0_0/model +(EngineCore pid=243) INFO 04-22 00:55:10 [compilation/monitor.py:48] torch.compile took 12.68 s in total +(EngineCore pid=243) INFO 04-22 00:55:11 [compilation/monitor.py:76] Initial profiling/warmup run took 0.90 s +(EngineCore pid=243) INFO 04-22 00:55:16 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for 
CUDA graph profiling +(EngineCore pid=243) INFO 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:55:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:55:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 212.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:55:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None 
+(EngineCore pid=243) DEBUG 04-22 00:55:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 00:55:17 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=243) INFO 04-22 00:55:17 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total +(APIServer pid=1) DEBUG 04-22 00:55:17 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=243) DEBUG 04-22 00:55:17 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=243) DEBUG 04-22 00:55:17 [v1/worker/gpu_worker.py:430] Free memory after profiling: 33.42 GiB (total), 29.98 GiB (within requested) +(EngineCore pid=243) DEBUG 04-22 00:55:17 [v1/worker/gpu_worker.py:435] Memory profiling takes 21.11 seconds. Total non KV cache memory: 47.04GiB; torch peak memory increase: 2.03GiB; non-torch forward increase memory: 0.25GiB; weights memory: 44.76GiB. +(EngineCore pid=243) INFO 04-22 00:55:17 [v1/worker/gpu_worker.py:436] Available KV cache memory: 28.19 GiB +(EngineCore pid=243) INFO 04-22 00:55:17 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. 
+(EngineCore pid=243) INFO 04-22 00:55:17 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 184,752 tokens +(EngineCore pid=243) INFO 04-22 00:55:17 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 22.55x +(EngineCore pid=243) 2026-04-22 00:55:17,728 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=243) DEBUG 04-22 00:55:17 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) 2026-04-22 00:55:17,742 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:55:53 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:55:53 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:55:54 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:55:54 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:55:54 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:55:54 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model mistralai/Mixtral-8x7B-Instruct-v0.1 +(APIServer pid=1) INFO 04-22 00:55:54 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:55:54 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:55:54 [entrypoints/utils.py:233] non-default args: {'model_tag': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:55:54 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:55:54 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.mixtral.MixtralForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:55:54 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0015548 secs +(APIServer pid=1) INFO 04-22 00:55:54 [config/model.py:549] Resolved architecture: MixtralForCausalLM +(APIServer pid=1) INFO 04-22 00:55:54 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:55:54 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:55:54 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:55:54 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:55:54 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 00:55:54 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:55:54 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 00:55:54 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:55:54 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) DEBUG 04-22 00:55:55 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. +(APIServer pid=1) INFO 04-22 00:55:55 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 00:55:55 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:55:55 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:55:56 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:55:56 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:55:59 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:55:59 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:55:59 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:55:59 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-22 00:55:59 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:55:59 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:55:59 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:55:59 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:55:59 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:55:59 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:55:59 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:55:59 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:56:04 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=244) DEBUG 04-22 00:56:05 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:56:05 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=244) DEBUG 04-22 00:56:05 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/899f8561-87fc-4033-8fbd-da1739ae100a'], outputs=['ipc:///tmp/bb7b91ab-7c47-4747-b3f9-be76f2bb676b'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=244) DEBUG 04-22 00:56:05 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=244) DEBUG 04-22 00:56:05 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=244) DEBUG 04-22 00:56:05 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=244) DEBUG 04-22 00:56:05 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=244) DEBUG 04-22 00:56:05 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=244) INFO 04-22 00:56:05 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='mistralai/Mixtral-8x7B-Instruct-v0.1', speculative_config=None, tokenizer='mistralai/Mixtral-8x7B-Instruct-v0.1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=mistralai/Mixtral-8x7B-Instruct-v0.1, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=244) WARNING 04-22 00:56:05 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=244) INFO 04-22 00:56:05 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.7.83 (local), world_size=2, local_world_size=2 +(EngineCore pid=244) DEBUG 04-22 00:56:05 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/750eb9b3-2997-4601-a967-1c5de9df277c +(EngineCore pid=244) DEBUG 04-22 00:56:05 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_18257a27'), local_subscribe_addr='ipc:///tmp/750eb9b3-2997-4601-a967-1c5de9df277c', local_notify_addr='ipc:///tmp/0eda70c6-3b93-4953-8320-cb4bfddf9d8c', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 00:56:09 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:56:09 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:56:09 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:56:09 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:56:09 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:56:09 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:56:09 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:56:09 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:56:09 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:56:09 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:56:09 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
+DEBUG 04-22 00:56:09 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:56:09 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:56:09 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:56:09 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:56:09 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:56:09 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:56:09 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:56:09 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:56:09 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:56:09 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:56:09 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:56:09 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:56:09 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:56:14 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:56:14 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 00:56:15 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:56:15 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:56:15 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:56:15 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 00:56:15 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:56:15 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:56:15 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:56:15 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) DEBUG 04-22 00:56:15 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker pid=443) DEBUG 04-22 00:56:15 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:38677 backend=nccl +(Worker pid=443) INFO 04-22 00:56:15 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:38677 backend=nccl +(Worker pid=444) DEBUG 04-22 00:56:16 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:38677 backend=nccl +(Worker pid=444) INFO 04-22 00:56:16 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:38677 backend=nccl +[Gloo] Rank 0 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=444) DEBUG 04-22 00:56:16 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=443) DEBUG 04-22 00:56:16 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=443) DEBUG 04-22 00:56:17 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=443) INFO 04-22 00:56:17 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=444) DEBUG 04-22 00:56:17 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=443) DEBUG 04-22 00:56:17 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=443) DEBUG 04-22 00:56:17 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/5ae62e70-3439-4050-81ce-b9208af6f8b5 +(Worker pid=443) DEBUG 04-22 00:56:17 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_5b23863a'), local_subscribe_addr='ipc:///tmp/5ae62e70-3439-4050-81ce-b9208af6f8b5', local_notify_addr='ipc:///tmp/99a778fb-5da6-4237-b33b-9bff59fd1ef3', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=444) DEBUG 04-22 00:56:17 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/5ae62e70-3439-4050-81ce-b9208af6f8b5 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 1 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 +(Worker pid=443) INFO 04-22 00:56:18 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A +(Worker pid=444) DEBUG 04-22 00:56:18 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.18GiB, total_memory=79.19GiB, cuda_memory=2.01GiB, torch_memory=0.02GiB, non_torch_memory=1.99GiB, timestamp=1776819378.4357853, auto_measure=True +(Worker pid=444) DEBUG 04-22 00:56:18 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=443) DEBUG 04-22 00:56:18 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.18GiB, total_memory=79.19GiB, cuda_memory=2.01GiB, torch_memory=0.02GiB, non_torch_memory=1.99GiB, timestamp=1776819378.438148, auto_measure=True +(Worker pid=443) DEBUG 04-22 00:56:18 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=443) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=444) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=443) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 00:56:18 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=444) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=443) DEBUG 04-22 00:56:18 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=443) DEBUG 04-22 00:56:18 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=444) DEBUG 04-22 00:56:18 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=443) DEBUG 04-22 00:56:18 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker pid=443) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker_TP0 pid=443) INFO 04-22 00:56:18 [v1/worker/gpu_model_runner.py:4735] Starting to load model mistralai/Mixtral-8x7B-Instruct-v0.1... 
+(Worker_TP1 pid=444) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker_TP0 pid=443) DEBUG 04-22 00:56:18 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_TP0 pid=443) INFO 04-22 00:56:18 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(Worker_TP0 pid=443) INFO 04-22 00:56:18 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP0 pid=443) INFO 04-22 00:56:18 [model_executor/.../oracle/unquantized.py:186] Using TRITON backend for Unquantized MoE +(Worker_TP0 pid=443) DEBUG 04-22 00:56:18 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts +(Worker_TP0 pid=443) DEBUG 04-22 00:56:19 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=443) DEBUG 04-22 00:56:19 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=443) DEBUG 04-22 00:56:19 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'fused_moe': 32, 'unquantized_fused_moe': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=443) DEBUG 04-22 00:56:19 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP1 pid=444) DEBUG 04-22 00:56:19 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP1 pid=444) DEBUG 04-22 00:56:19 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP1 pid=444) DEBUG 04-22 00:56:19 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'fused_moe': 32, 'unquantized_fused_moe': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=444) DEBUG 04-22 00:56:19 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP1 pid=444) DEBUG 04-22 00:56:19 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00018-of-00019.safetensors', 'model-00008-of-00019.safetensors', 'model-00009-of-00019.safetensors', 'model-00017-of-00019.safetensors', 'model-00007-of-00019.safetensors', 'model-00005-of-00019.safetensors', 'model-00002-of-00019.safetensors', 'model-00010-of-00019.safetensors', 'model-00003-of-00019.safetensors', 'model-00012-of-00019.safetensors', 'model-00016-of-00019.safetensors', 'model-00011-of-00019.safetensors', 'model-00014-of-00019.safetensors', 'model-00004-of-00019.safetensors', 'model-00006-of-00019.safetensors', 'model-00015-of-00019.safetensors', 'model-00001-of-00019.safetensors', 'model-00019-of-00019.safetensors', 'model-00013-of-00019.safetensors']] +(Worker_TP0 pid=443) DEBUG 04-22 00:56:19 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00010-of-00019.safetensors', 'model-00004-of-00019.safetensors', 'model-00001-of-00019.safetensors', 'model-00016-of-00019.safetensors', 'model-00008-of-00019.safetensors', 'model-00012-of-00019.safetensors', 'model-00019-of-00019.safetensors', 'model-00003-of-00019.safetensors', 'model-00006-of-00019.safetensors', 'model-00018-of-00019.safetensors', 'model-00015-of-00019.safetensors', 'model-00013-of-00019.safetensors', 
'model-00009-of-00019.safetensors', 'model-00011-of-00019.safetensors', 'model-00014-of-00019.safetensors', 'model-00007-of-00019.safetensors', 'model-00002-of-00019.safetensors', 'model-00017-of-00019.safetensors', 'model-00005-of-00019.safetensors']] +(Worker_TP0 pid=443) Loading safetensors checkpoint shards: 0% Completed | 0/19 [00:00 +(Worker_TP1 pid=444) DEBUG 04-22 00:57:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 00:57:25 [compilation/decorators.py:528] Start compiling function +(APIServer pid=1) DEBUG 04-22 00:57:25 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=244) DEBUG 04-22 00:57:26 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py +(Worker_TP0 pid=443) 
DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/mixtral.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py 
+(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/mixtral.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1063] torch.compile cache 
factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=cbf0405b61 comp=e546579c48 code=25313a6b8ebc50305714b2e0cfd04eb322a52ea698903c8ad5f901c77d4999c2 dir=/data/.cache/vllm/torch_compile_cache/6f8993cba3/rank_1_0/backbone +(Worker_TP0 pid=443) INFO 04-22 00:57:29 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/6f8993cba3/rank_0_0/backbone for vLLM's torch.compile +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=cbf0405b61 comp=e546579c48 code=25313a6b8ebc50305714b2e0cfd04eb322a52ea698903c8ad5f901c77d4999c2 dir=/data/.cache/vllm/torch_compile_cache/6f8993cba3/rank_0_0/backbone +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'VLLM_DISABLE_PYNCCL': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 
[compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP1 
pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP1 
pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP1 pid=444) DEBUG 04-22 
00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, 
+(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'VLLM_TRACE_FUNCTION': 0, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 
[compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, 
+(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] Vllm config hash: cbf0405b61 +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP0 pid=443) DEBUG 
04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP0 pid=443) DEBUG 04-22 
00:57:29 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP0 
pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP0 pid=443) 
DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': 
False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] Vllm 
config hash: cbf0405b61 +(Worker_TP0 pid=443) INFO 04-22 00:57:29 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.32 s +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 8192 +(Worker_TP0 pid=443) INFO 04-22 00:57:29 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm +(Worker_TP0 pid=443) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. +(Worker_TP0 pid=443) return func(*args, **kwargs) +(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=8192, hidden_dim=4096, dtype=torch.bfloat16 +(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=8192, hidden_dim=4096, dtype=torch.bfloat16 +(Worker_TP0 pid=443) INFO 04-22 00:57:29 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm +(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=444) DEBUG 04-22 00:57:30 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=444) DEBUG 04-22 00:57:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 23.5 ms +(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=444) DEBUG 04-22 00:57:30 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP1 pid=444) DEBUG 04-22 00:57:30 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 23.7 ms +(Worker_TP1 pid=444) DEBUG 04-22 00:57:30 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=444) DEBUG 04-22 00:57:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP1 pid=444) DEBUG 04-22 00:57:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=443) INFO 04-22 00:57:33 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(Worker_TP0 pid=443) DEBUG 04-22 00:57:33 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/6f8993cba3/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(Worker_TP0 pid=443) DEBUG 04-22 00:57:33 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=443) DEBUG 04-22 00:57:33 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=443) DEBUG 04-22 00:57:33 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=443) DEBUG 04-22 00:57:33 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 36.3 ms +(Worker_TP0 pid=443) DEBUG 04-22 00:57:33 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP0 pid=443) DEBUG 04-22 00:57:33 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=443) DEBUG 04-22 00:57:33 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP1 pid=444) DEBUG 04-22 00:57:33 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=444) DEBUG 04-22 00:57:33 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=444) DEBUG 04-22 00:57:33 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=444) DEBUG 04-22 00:57:33 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 35.8 ms +(Worker_TP1 pid=444) DEBUG 04-22 00:57:33 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP1 pid=444) DEBUG 04-22 00:57:33 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=444) DEBUG 04-22 00:57:33 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=443) DEBUG 04-22 00:57:34 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', 
'/data/.cache/vllm/torch_compile_cache/6f8993cba3/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 36.8 ms +(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/6f8993cba3/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') +(Worker_TP0 pid=443) INFO 04-22 00:57:35 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.02 s +(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/6f8993cba3/rank_0_0/backbone/computation_graph.py +(Worker_TP1 pid=444) DEBUG 04-22 00:57:35 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=444) DEBUG 04-22 00:57:35 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP1 pid=444) DEBUG 04-22 00:57:35 
[compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=444) DEBUG 04-22 00:57:35 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 36.9 ms +(Worker_TP1 pid=444) DEBUG 04-22 00:57:35 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP1 pid=444) DEBUG 04-22 00:57:35 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP1 pid=444) DEBUG 04-22 00:57:35 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(APIServer pid=1) DEBUG 04-22 00:57:35 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=443) INFO 04-22 00:57:36 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/de1a3d5a254a6970995e4bf5ae425faa50662dbbc60a8657443a824f90c07d5c/rank_0_0/model +(Worker_TP0 pid=443) INFO 04-22 00:57:36 [compilation/monitor.py:48] torch.compile took 11.13 s in total +(Worker_TP0 pid=443) INFO 04-22 00:57:36 [model_executor/.../fused_moe/fused_moe.py:1077] Using configuration from /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json for MoE layer. 
+(Worker_TP0 pid=443) INFO 04-22 00:57:37 [compilation/monitor.py:76] Initial profiling/warmup run took 1.40 s +(Worker_TP0 pid=443) INFO 04-22 00:57:43 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP0 pid=443) DEBUG 04-22 00:57:43 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP0 pid=443) INFO 04-22 00:57:43 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP1 pid=444) INFO 04-22 00:57:43 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP1 pid=444) DEBUG 04-22 00:57:43 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP1 pid=444) INFO 04-22 00:57:43 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP0 pid=443) DEBUG 04-22 00:57:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 00:57:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, 
ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 84.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 84.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on 
(FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 136.00 MiB first-capture + (51-1) × 8.00 MiB per-graph +(Worker_TP1 pid=444) INFO 04-22 00:57:44 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 136.00 MiB first-capture + (51-1) × 8.00 MiB per-graph +(Worker_TP0 pid=443) INFO 04-22 00:57:44 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP0 pid=443) DEBUG 04-22 00:57:45 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP0 pid=443) INFO 04-22 00:57:45 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.82 GiB total +(Worker_TP1 pid=444) DEBUG 04-22 00:57:45 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and 
CUDA graphs +(Worker_TP1 pid=444) INFO 04-22 00:57:45 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.82 GiB total +(APIServer pid=1) DEBUG 04-22 00:57:45 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=443) DEBUG 04-22 00:57:45 [v1/worker/gpu_worker.py:424] Initial free memory: 77.18 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP0 pid=443) DEBUG 04-22 00:57:45 [v1/worker/gpu_worker.py:430] Free memory after profiling: 30.69 GiB (total), 28.73 GiB (within requested) +(Worker_TP0 pid=443) DEBUG 04-22 00:57:45 [v1/worker/gpu_worker.py:435] Memory profiling takes 20.58 seconds. Total non KV cache memory: 46.79GiB; torch peak memory increase: 1.21GiB; non-torch forward increase memory: 2.07GiB; weights memory: 43.51GiB. +(Worker_TP0 pid=443) INFO 04-22 00:57:45 [v1/worker/gpu_worker.py:436] Available KV cache memory: 28.44 GiB +(Worker_TP0 pid=443) INFO 04-22 00:57:45 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9603 to maintain the same effective KV cache size. 
+(Worker_TP0 pid=443) DEBUG 04-22 00:57:45 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 00:57:45 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 00:57:45 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=444) DEBUG 04-22 00:57:45 [v1/worker/gpu_worker.py:424] Initial free memory: 77.18 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP1 pid=444) DEBUG 04-22 00:57:45 [v1/worker/gpu_worker.py:430] Free memory after profiling: 30.69 GiB (total), 28.73 GiB (within requested) +(Worker_TP1 pid=444) DEBUG 04-22 00:57:45 [v1/worker/gpu_worker.py:435] Memory profiling takes 20.55 seconds. Total non KV cache memory: 46.79GiB; torch peak memory increase: 1.21GiB; non-torch forward increase memory: 2.07GiB; weights memory: 43.51GiB. +(Worker_TP1 pid=444) INFO 04-22 00:57:45 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9603 to maintain the same effective KV cache size. 
+(Worker_TP1 pid=444) DEBUG 04-22 00:57:45 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 00:57:45 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) INFO 04-22 00:57:45 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 465,936 tokens +(EngineCore pid=244) INFO 04-22 00:57:45 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 56.88x +(Worker_TP1 pid=444) DEBUG 04-22 00:57:45 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=443) DEBUG 04-22 00:57:45 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=443) 2026-04-22 00:57:45,967 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP1 pid=444) 2026-04-22 00:57:45,967 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP1 pid=444) DEBUG 04-22 00:57:45 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 00:57:45 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) 2026-04-22 00:57:45,993 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP0 pid=443) 2026-04-22 00:57:45,994 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP0 pid=443) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=244) DEBUG 04-22 00:57:57 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. +(EngineCore pid=244) INFO 04-22 00:57:57 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP0 pid=443) DEBUG 04-22 00:57:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=444) DEBUG 04-22 00:57:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 00:57:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 00:57:57 [v1/engine/core.py:1158] EngineCore waiting for work. +(EngineCore pid=244) DEBUG 04-22 00:57:57 [v1/engine/core.py:1158] EngineCore waiting for work. +(APIServer pid=1) INFO 04-22 00:57:57 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] +(APIServer pid=1) DEBUG 04-22 00:57:58 [renderers/base.py:197] Warming up chat template processing... +(APIServer pid=1) DEBUG 04-22 00:57:58 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e81d16-5a939a2440aa6b226ae3b22d;c34969b7-02e8-49b8-bf74-f77fa7c257a1) +(APIServer pid=1) DEBUG 04-22 00:57:58 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 00:57:58 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1/resolve/main/processor_config.json. +(APIServer pid=1) DEBUG 04-22 00:57:58 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. 
(Request ID: Root=1-69e81d16-16d7d3f56520faa20bc74780;0f1891c1-6bf7-452a-a1b3-3d65bb07b6c6) +(APIServer pid=1) DEBUG 04-22 00:57:58 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 00:57:58 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1/resolve/main/preprocessor_config.json. +(Worker_TP0 pid=443) DEBUG 04-22 00:57:58 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=444) DEBUG 04-22 00:57:58 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) INFO 04-22 00:57:58 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. +(APIServer pid=1) DEBUG 04-22 00:57:58 [renderers/base.py:203] Chat template warmup completed in 0.592s +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:37] Available routes are: +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /load, Methods: GET +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /version, Methods: GET +(APIServer pid=1) INFO 04-22 00:57:58 
[entrypoints/launcher.py:46] Route: /health, Methods: GET +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST +(APIServer pid=1) INFO 04-22 00:57:59 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=1) INFO 04-22 00:57:59 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 00:57:59 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 00:57:59 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST +(APIServer pid=1) INFO 04-22 00:57:59 [entrypoints/launcher.py:46] Route: /v1/completions/render, 
Methods: POST +(APIServer pid=1) INFO: Started server process [1] +(APIServer pid=1) INFO: Waiting for application startup. +(APIServer pid=1) INFO: Application startup complete. +(APIServer pid=1) DEBUG 04-22 00:58:03 [v1/engine/async_llm.py:875] Called check_health. +(APIServer pid=1) INFO: 10.131.6.2:57516 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp2pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp2pp1dp1--8192.log new file mode 100644 index 00000000..6d2e1de1 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp2pp1dp1--8192.log @@ -0,0 +1,1565 @@ +DEBUG 04-21 23:59:08 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-21 23:59:08 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-21 23:59:08 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-21 23:59:08 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:59:08 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:59:08 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-21 23:59:08 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-21 23:59:08 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-21 23:59:08 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-21 23:59:08 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:59:08 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:59:08 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-21 23:59:13 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-21 23:59:14 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-21 23:59:14 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-21 23:59:14 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-21 23:59:14 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-21 23:59:14 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-21 23:59:14 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-21 23:59:14 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-21 23:59:14 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-21 23:59:14 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model moonshotai/Kimi-Dev-72B +(APIServer pid=1) INFO 04-21 23:59:14 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-21 23:59:14 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-21 23:59:14 [entrypoints/utils.py:233] non-default args: {'model_tag': 'moonshotai/Kimi-Dev-72B', 'model': 'moonshotai/Kimi-Dev-72B', 'trust_remote_code': True, 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-21 23:59:14 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-21 23:59:15 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-21 23:59:15 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0011903 secs +(APIServer pid=1) INFO 04-21 23:59:15 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM +(APIServer pid=1) INFO 04-21 23:59:15 
[config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-21 23:59:15 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-21 23:59:15 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-21 23:59:15 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-21 23:59:15 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-21 23:59:15 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-21 23:59:15 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-21 23:59:15 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-21 23:59:15 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) INFO 04-21 23:59:16 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-21 23:59:16 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. 
+(APIServer pid=1) DEBUG 04-21 23:59:16 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-21 23:59:16 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-21 23:59:16 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-21 23:59:20 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-21 23:59:20 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-21 23:59:20 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-21 23:59:20 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:59:20 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:59:20 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-21 23:59:20 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-21 23:59:20 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-21 23:59:20 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-21 23:59:20 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:59:20 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:59:20 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-21 23:59:25 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=243) DEBUG 04-21 23:59:26 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-21 23:59:26 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=243) DEBUG 04-21 23:59:26 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/a6342252-7400-4e34-84be-903fde2e7e07'], outputs=['ipc:///tmp/4a1fa813-4215-4d18-96d1-bf099cceb399'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-21 23:59:26 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-21 23:59:26 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-21 23:59:26 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-21 23:59:26 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-21 23:59:26 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=243) INFO 04-21 23:59:26 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='moonshotai/Kimi-Dev-72B', speculative_config=None, tokenizer='moonshotai/Kimi-Dev-72B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=moonshotai/Kimi-Dev-72B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [4096, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) WARNING 04-21 23:59:26 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=243) INFO 04-21 23:59:26 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.128.4.183 (local), world_size=2, local_world_size=2 +(EngineCore pid=243) DEBUG 04-21 23:59:26 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/cd9cd408-5342-4764-8f72-5cff1c2769cc +(EngineCore pid=243) DEBUG 04-21 23:59:26 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_1004733a'), local_subscribe_addr='ipc:///tmp/cd9cd408-5342-4764-8f72-5cff1c2769cc', local_notify_addr='ipc:///tmp/0e466130-897f-4cf5-a03f-2da8be229250', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-21 23:59:30 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-21 23:59:30 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-21 23:59:30 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-21 23:59:30 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:59:30 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:59:30 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-21 23:59:30 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-21 23:59:30 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-21 23:59:30 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-21 23:59:30 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:59:30 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:59:30 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-21 23:59:30 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-21 23:59:30 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-21 23:59:30 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-21 23:59:30 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:59:30 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:59:30 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-21 23:59:30 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-21 23:59:30 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-21 23:59:30 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-21 23:59:30 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:59:30 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:59:30 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-21 23:59:35 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-21 23:59:35 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-21 23:59:36 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-21 23:59:36 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-21 23:59:36 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-21 23:59:36 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) DEBUG 04-21 23:59:36 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +DEBUG 04-21 23:59:36 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-21 23:59:36 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-21 23:59:36 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-21 23:59:36 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(Worker pid=443) DEBUG 04-21 23:59:36 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:57857 backend=nccl +(Worker pid=443) INFO 04-21 23:59:36 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:57857 backend=nccl +(Worker pid=442) DEBUG 04-21 23:59:37 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:57857 backend=nccl +(Worker pid=442) INFO 04-21 23:59:37 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:57857 backend=nccl +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=442) DEBUG 04-21 23:59:37 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=443) DEBUG 04-21 23:59:37 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 +(Worker pid=442) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=442) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=442) DEBUG 04-21 23:59:38 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=442) INFO 04-21 23:59:38 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=443) DEBUG 04-21 23:59:38 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=442) DEBUG 04-21 23:59:38 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=442) DEBUG 04-21 23:59:38 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/8bdb0334-b32f-4726-8ef3-de6647788486 +(Worker pid=442) DEBUG 04-21 23:59:38 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_8c80a23b'), local_subscribe_addr='ipc:///tmp/8bdb0334-b32f-4726-8ef3-de6647788486', local_notify_addr='ipc:///tmp/7d634166-ae36-4d6c-909d-eaddce568a1e', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=443) DEBUG 04-21 23:59:38 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/8bdb0334-b32f-4726-8ef3-de6647788486 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(Worker pid=442) INFO 04-21 23:59:38 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(Worker pid=442) DEBUG 04-21 23:59:39 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776815979.1975927, auto_measure=True +(Worker pid=442) DEBUG 04-21 23:59:39 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=443) DEBUG 04-21 23:59:39 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776815979.206957, auto_measure=True +(Worker pid=443) DEBUG 04-21 23:59:39 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=442) DEBUG 04-21 23:59:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=442) DEBUG 04-21 23:59:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=443) DEBUG 04-21 23:59:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-21 23:59:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=442) DEBUG 04-21 23:59:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-21 23:59:39 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=442) DEBUG 04-21 23:59:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=443) DEBUG 04-21 23:59:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=442) DEBUG 04-21 23:59:39 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=442) DEBUG 04-21 23:59:39 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=443) DEBUG 04-21 23:59:39 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=442) DEBUG 04-21 23:59:39 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_TP0 pid=442) INFO 04-21 23:59:39 [v1/worker/gpu_model_runner.py:4735] Starting to load model moonshotai/Kimi-Dev-72B... +(Worker_TP0 pid=442) DEBUG 04-21 23:59:39 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_TP0 pid=442) INFO 04-21 23:59:39 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(Worker_TP0 pid=442) INFO 04-21 23:59:39 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP1 pid=443) DEBUG 04-21 23:59:40 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP1 pid=443) DEBUG 04-21 23:59:40 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP1 pid=443) DEBUG 04-21 23:59:40 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=443) DEBUG 04-21 23:59:40 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP0 pid=442) DEBUG 04-21 23:59:40 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=442) DEBUG 04-21 23:59:40 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=442) DEBUG 04-21 23:59:40 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=442) DEBUG 04-21 23:59:40 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP1 pid=443) DEBUG 04-21 23:59:40 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-79-of-80.safetensors', 'model-27-of-80.safetensors', 'model-25-of-80.safetensors', 'model-46-of-80.safetensors', 'model-17-of-80.safetensors', 'model-8-of-80.safetensors', 'model-77-of-80.safetensors', 'model-41-of-80.safetensors', 'model-31-of-80.safetensors', 'model-39-of-80.safetensors', 'model-48-of-80.safetensors', 'model-47-of-80.safetensors', 'model-18-of-80.safetensors', 'model-66-of-80.safetensors', 'model-68-of-80.safetensors', 'model-21-of-80.safetensors', 'model-28-of-80.safetensors', 'model-62-of-80.safetensors', 'model-12-of-80.safetensors', 'model-51-of-80.safetensors', 'model-15-of-80.safetensors', 'model-34-of-80.safetensors', 'model-38-of-80.safetensors', 'model-61-of-80.safetensors', 'model-29-of-80.safetensors', 'model-35-of-80.safetensors', 'model-10-of-80.safetensors', 'model-54-of-80.safetensors', 'model-78-of-80.safetensors', 'model-55-of-80.safetensors', 'model-65-of-80.safetensors', 'model-24-of-80.safetensors', 'model-11-of-80.safetensors', 'model-53-of-80.safetensors', 'model-4-of-80.safetensors', 'model-2-of-80.safetensors', 'model-32-of-80.safetensors', 'model-74-of-80.safetensors', 'model-70-of-80.safetensors', 'model-14-of-80.safetensors', 'model-5-of-80.safetensors', 'model-43-of-80.safetensors', 'model-22-of-80.safetensors', 'model-72-of-80.safetensors', 'model-60-of-80.safetensors', 'model-33-of-80.safetensors', 'model-64-of-80.safetensors', 'model-42-of-80.safetensors', 'model-13-of-80.safetensors', 'model-75-of-80.safetensors', 'model-9-of-80.safetensors', 'model-56-of-80.safetensors', 'model-45-of-80.safetensors', 'model-44-of-80.safetensors', 'model-20-of-80.safetensors', 'model-71-of-80.safetensors', 'model-57-of-80.safetensors', 'model-76-of-80.safetensors', 'model-40-of-80.safetensors', 'model-19-of-80.safetensors', 'model-1-of-80.safetensors', 'model-36-of-80.safetensors', 
'model-58-of-80.safetensors', 'model-7-of-80.safetensors', 'model-16-of-80.safetensors', 'model-30-of-80.safetensors', 'model-59-of-80.safetensors', 'model-67-of-80.safetensors', 'model-80-of-80.safetensors', 'model-3-of-80.safetensors', 'model-6-of-80.safetensors', 'model-52-of-80.safetensors', 'model-26-of-80.safetensors', 'model-37-of-80.safetensors', 'model-49-of-80.safetensors', 'model-23-of-80.safetensors', 'model-73-of-80.safetensors', 'model-63-of-80.safetensors', 'model-69-of-80.safetensors', 'model-50-of-80.safetensors']] +(Worker_TP0 pid=442) DEBUG 04-21 23:59:40 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-65-of-80.safetensors', 'model-40-of-80.safetensors', 'model-66-of-80.safetensors', 'model-9-of-80.safetensors', 'model-68-of-80.safetensors', 'model-27-of-80.safetensors', 'model-15-of-80.safetensors', 'model-11-of-80.safetensors', 'model-76-of-80.safetensors', 'model-49-of-80.safetensors', 'model-21-of-80.safetensors', 'model-13-of-80.safetensors', 'model-73-of-80.safetensors', 'model-28-of-80.safetensors', 'model-52-of-80.safetensors', 'model-25-of-80.safetensors', 'model-69-of-80.safetensors', 'model-71-of-80.safetensors', 'model-75-of-80.safetensors', 'model-30-of-80.safetensors', 'model-45-of-80.safetensors', 'model-32-of-80.safetensors', 'model-64-of-80.safetensors', 'model-37-of-80.safetensors', 'model-1-of-80.safetensors', 'model-14-of-80.safetensors', 'model-60-of-80.safetensors', 'model-36-of-80.safetensors', 'model-39-of-80.safetensors', 'model-48-of-80.safetensors', 'model-74-of-80.safetensors', 'model-62-of-80.safetensors', 'model-77-of-80.safetensors', 'model-72-of-80.safetensors', 'model-58-of-80.safetensors', 'model-67-of-80.safetensors', 'model-19-of-80.safetensors', 'model-23-of-80.safetensors', 'model-59-of-80.safetensors', 'model-24-of-80.safetensors', 'model-29-of-80.safetensors', 'model-18-of-80.safetensors', 'model-4-of-80.safetensors', 'model-33-of-80.safetensors', 
'model-5-of-80.safetensors', 'model-78-of-80.safetensors', 'model-22-of-80.safetensors', 'model-10-of-80.safetensors', 'model-55-of-80.safetensors', 'model-16-of-80.safetensors', 'model-3-of-80.safetensors', 'model-43-of-80.safetensors', 'model-31-of-80.safetensors', 'model-7-of-80.safetensors', 'model-41-of-80.safetensors', 'model-53-of-80.safetensors', 'model-8-of-80.safetensors', 'model-80-of-80.safetensors', 'model-35-of-80.safetensors', 'model-56-of-80.safetensors', 'model-2-of-80.safetensors', 'model-17-of-80.safetensors', 'model-46-of-80.safetensors', 'model-34-of-80.safetensors', 'model-54-of-80.safetensors', 'model-79-of-80.safetensors', 'model-26-of-80.safetensors', 'model-47-of-80.safetensors', 'model-50-of-80.safetensors', 'model-42-of-80.safetensors', 'model-12-of-80.safetensors', 'model-6-of-80.safetensors', 'model-63-of-80.safetensors', 'model-57-of-80.safetensors', 'model-44-of-80.safetensors', 'model-20-of-80.safetensors', 'model-61-of-80.safetensors', 'model-51-of-80.safetensors', 'model-70-of-80.safetensors', 'model-38-of-80.safetensors']] +(Worker_TP0 pid=442) Loading safetensors checkpoint shards: 0% Completed | 0/80 [00:00 +(Worker_TP1 pid=443) DEBUG 04-22 00:01:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:01:25 [compilation/decorators.py:528] Start compiling function +(EngineCore pid=243) DEBUG 04-22 00:01:26 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) DEBUG 04-22 00:01:26 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP0 pid=442) INFO 04-22 00:01:34 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone for vLLM's torch.compile +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=defe415cac comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 
[compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
+(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP0 pid=442) DEBUG 04-22 
00:01:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP0 pid=442) 
DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 
00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP0 
pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP0 
pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] Vllm config hash: defe415cac +(Worker_TP0 pid=442) INFO 04-22 00:01:34 [compilation/backends.py:1111] Dynamo bytecode transform time: 8.75 s +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 4096 +(Worker_TP0 pid=442) INFO 04-22 00:01:34 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=defe415cac comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/0288147631/rank_1_0/backbone +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': 
None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 
'binary', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, 
+(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 
600, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 
[compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 
[compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 
[compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP1 
pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 
'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP1 pid=443) 
DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] Vllm config hash: defe415cac +(Worker_TP0 pid=442) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. +(Worker_TP0 pid=442) return func(*args, **kwargs) +(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=4096, hidden_dim=8192, dtype=torch.bfloat16 +(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=4096, hidden_dim=8192, dtype=torch.bfloat16 +(Worker_TP0 pid=442) INFO 04-22 00:01:34 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm +(Worker_TP0 pid=442) DEBUG 04-22 00:01:35 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 4096), (4097, 8192)] +(Worker_TP0 pid=442) DEBUG 04-22 00:01:35 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(Worker_TP0 pid=442) DEBUG 04-22 00:01:35 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 
pid=442) DEBUG 04-22 00:01:35 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:36 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP0 pid=442) DEBUG 04-22 00:01:36 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 34.6 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:36 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:36 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP0 pid=442) DEBUG 04-22 00:01:36 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:36 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 00:01:36 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:36 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP1 pid=443) DEBUG 04-22 00:01:36 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 34.9 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:36 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:36 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP1 pid=443) DEBUG 04-22 00:01:36 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(APIServer pid=1) DEBUG 04-22 00:01:36 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP0 pid=442) INFO 04-22 00:01:38 [compilation/backends.py:372] Cache the graph of compile range (1, 4096) for later use +(Worker_TP0 pid=442) DEBUG 04-22 00:01:38 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_0', '/data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_0') +(Worker_TP0 pid=442) DEBUG 04-22 00:01:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 00:01:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:38 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP0 pid=442) DEBUG 04-22 00:01:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=442) DEBUG 04-22 00:01:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 00:01:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:38 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP1 pid=443) DEBUG 04-22 00:01:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=443) DEBUG 04-22 00:01:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed 
in 0.3 ms +(Worker_TP0 pid=442) INFO 04-22 00:01:39 [compilation/backends.py:372] Cache the graph of compile range (4097, 8192) for later use +(Worker_TP0 pid=442) DEBUG 04-22 00:01:39 [compilation/backends.py:377] Store the 0-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_0') +(Worker_TP0 pid=442) DEBUG 04-22 00:01:39 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 00:01:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:39 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=442) DEBUG 04-22 00:01:39 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 61.3 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=442) DEBUG 04-22 00:01:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:40 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 00:01:40 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:40 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=443) DEBUG 04-22 00:01:40 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 62.4 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:40 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms 
+(Worker_TP1 pid=443) DEBUG 04-22 00:01:40 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=443) DEBUG 04-22 00:01:40 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:40 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_1', '/data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_1') +(Worker_TP0 pid=442) DEBUG 04-22 00:01:41 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 00:01:41 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:41 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP0 pid=442) DEBUG 04-22 00:01:41 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:41 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=442) DEBUG 04-22 00:01:41 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:41 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 00:01:41 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:41 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP1 pid=443) DEBUG 04-22 00:01:41 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:41 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 
nodes, removed 0 nodes +(Worker_TP1 pid=443) DEBUG 04-22 00:01:41 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:42 [compilation/backends.py:377] Store the 1-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_1') +(APIServer pid=1) DEBUG 04-22 00:01:46 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 61.5 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/backends.py:377] Store the 80-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_80', '/data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_80') +(Worker_TP0 pid=442) INFO 04-22 00:01:48 [compilation/backends.py:390] Compiling a graph for compile range (1, 4096) takes 9.96 s +(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:48 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=443) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 62.8 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP1 pid=443) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:49 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 00:01:49 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:49 [compilation/passes/pass_manager.py:100] Skipping with 
compile range (4097, 8192) +(Worker_TP1 pid=443) DEBUG 04-22 00:01:49 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:01:49 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=443) DEBUG 04-22 00:01:49 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:01:49 [compilation/backends.py:377] Store the 80-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_80', '/data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_80') +(Worker_TP0 pid=442) INFO 04-22 00:01:49 [compilation/backends.py:390] Compiling a graph for compile range (4097, 8192) takes 10.85 s +(Worker_TP0 pid=442) DEBUG 04-22 00:01:49 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone/computation_graph.py +(Worker_TP0 pid=442) INFO 04-22 00:01:52 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/d910a456355fd0e3a8ef08216e2a64de8c7290dd49f50f279a253f6d2d652529/rank_0_0/model +(Worker_TP0 pid=442) INFO 04-22 00:01:52 [compilation/monitor.py:48] torch.compile took 26.70 s in total +(Worker_TP0 pid=442) INFO 04-22 00:01:53 [compilation/monitor.py:76] Initial profiling/warmup run took 1.95 s +(APIServer pid=1) DEBUG 04-22 00:01:56 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP1 pid=443) INFO 04-22 00:01:59 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP0 pid=442) INFO 04-22 00:01:59 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP1 pid=443) DEBUG 04-22 00:01:59 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP1 pid=443) INFO 04-22 00:01:59 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP0 pid=442) DEBUG 04-22 00:01:59 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP0 pid=442) INFO 04-22 00:01:59 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP1 pid=443) DEBUG 04-22 00:01:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:01:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:01:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:01:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:01:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=442) DEBUG 04-22 00:01:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 152.00 MiB first-capture + (51-1) × 18.00 MiB per-graph +(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 152.00 MiB first-capture + (51-1) × 18.00 MiB per-graph +(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
+(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 266.00 MiB first-capture + (51-1) × 10.00 MiB per-graph +(Worker_TP0 pid=442) INFO 04-22 00:02:00 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 266.00 MiB first-capture + (51-1) × 10.00 MiB per-graph +(Worker_TP1 pid=443) INFO 04-22 00:02:00 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP0 pid=442) DEBUG 04-22 00:02:01 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP0 pid=442) INFO 04-22 00:02:01 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.63 GiB total +(Worker_TP1 pid=443) DEBUG 04-22 00:02:01 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP1 pid=443) INFO 04-22 00:02:01 [v1/worker/gpu_model_runner.py:5955] Estimated 
CUDA graph memory: 1.63 GiB total +(Worker_TP1 pid=443) DEBUG 04-22 00:02:01 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP1 pid=443) DEBUG 04-22 00:02:01 [v1/worker/gpu_worker.py:430] Free memory after profiling: 6.35 GiB (total), 3.93 GiB (within requested) +(Worker_TP1 pid=443) DEBUG 04-22 00:02:01 [v1/worker/gpu_worker.py:435] Memory profiling takes 36.19 seconds. Total non KV cache memory: 72.21GiB; torch peak memory increase: 2.29GiB; non-torch forward increase memory: 2.09GiB; weights memory: 67.82GiB. +(Worker_TP1 pid=443) INFO 04-22 00:02:01 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9705 to maintain the same effective KV cache size. +(Worker_TP1 pid=443) DEBUG 04-22 00:02:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=442) DEBUG 04-22 00:02:01 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP0 pid=442) DEBUG 04-22 00:02:01 [v1/worker/gpu_worker.py:430] Free memory after profiling: 6.35 GiB (total), 3.93 GiB (within requested) +(Worker_TP0 pid=442) DEBUG 04-22 00:02:01 [v1/worker/gpu_worker.py:435] Memory profiling takes 36.26 seconds. Total non KV cache memory: 72.21GiB; torch peak memory increase: 2.29GiB; non-torch forward increase memory: 2.09GiB; weights memory: 67.82GiB. 
+(Worker_TP0 pid=442) INFO 04-22 00:02:01 [v1/worker/gpu_worker.py:436] Available KV cache memory: 3.02 GiB +(Worker_TP0 pid=442) INFO 04-22 00:02:01 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9705 to maintain the same effective KV cache size. +(Worker_TP0 pid=442) DEBUG 04-22 00:02:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 00:02:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) INFO 04-22 00:02:01 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 19,776 tokens +(EngineCore pid=243) INFO 04-22 00:02:01 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 2.41x +(Worker_TP0 pid=442) DEBUG 04-22 00:02:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=443) DEBUG 04-22 00:02:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=443) INFO 04-22 00:02:01 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 +(Worker_TP0 pid=442) INFO 04-22 00:02:01 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 +(Worker_TP1 pid=443) DEBUG 04-22 00:02:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:02:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) 2026-04-22 00:02:01,768 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(Worker_TP0 pid=442) 2026-04-22 00:02:01,768 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP1 pid=443) DEBUG 04-22 00:02:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:02:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:02:02 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=442) 2026-04-22 00:02:02,650 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP1 pid=443) 2026-04-22 00:02:02,650 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP0 pid=442) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=243) INFO 04-22 00:02:14 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP1 pid=443) DEBUG 04-22 00:02:14 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=442) DEBUG 04-22 00:02:14 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 00:02:14 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 00:02:14 [v1/engine/core.py:1158] EngineCore waiting for work. +(EngineCore pid=243) DEBUG 04-22 00:02:14 [v1/engine/core.py:1158] EngineCore waiting for work. 
+(APIServer pid=1) INFO 04-22 00:02:14 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] +(APIServer pid=1) DEBUG 04-22 00:02:15 [renderers/base.py:197] Warming up chat template processing... +(APIServer pid=1) DEBUG 04-22 00:02:15 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e81007-4b54106943b22e9725ac587c;f5e0296d-657f-4ca7-b92e-fc2b46bcc673) +(APIServer pid=1) DEBUG 04-22 00:02:15 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 00:02:15 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-Dev-72B/resolve/main/processor_config.json. +(APIServer pid=1) DEBUG 04-22 00:02:15 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e81007-0795e5ec52136b1d03088420;101c0272-f6f3-4106-9a59-ba75fbc2f711) +(APIServer pid=1) DEBUG 04-22 00:02:15 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 00:02:15 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-Dev-72B/resolve/main/preprocessor_config.json. +(Worker_TP0 pid=442) DEBUG 04-22 00:02:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=443) DEBUG 04-22 00:02:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) INFO 04-22 00:02:15 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. 
+(APIServer pid=1) DEBUG 04-22 00:02:15 [renderers/base.py:203] Chat template warmup completed in 0.740s +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:37] Available routes are: +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /load, Methods: GET +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /version, Methods: GET +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /health, Methods: GET +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST +(APIServer pid=1) INFO 04-22 00:02:16 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST +(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST +(APIServer pid=1) INFO: Started server process [1] +(APIServer pid=1) INFO: Waiting for application startup. +(APIServer pid=1) INFO: Application startup complete. +(APIServer pid=1) DEBUG 04-22 00:02:23 [v1/engine/async_llm.py:875] Called check_health. 
+(APIServer pid=1) INFO: 10.128.4.2:33468 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.FAILED.log b/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.FAILED.log new file mode 100644 index 00000000..cd8bb00d --- /dev/null +++ b/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.FAILED.log @@ -0,0 +1,399 @@ +DEBUG 04-22 00:02:38 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:02:38 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:02:38 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:02:38 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:02:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:02:38 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:02:38 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:02:38 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:02:38 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:02:38 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:02:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:02:38 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:02:43 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 00:02:45 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 00:02:45 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:02:45 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:02:45 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:02:45 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 00:02:45 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:02:45 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:02:45 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:02:45 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model moonshotai/Kimi-Dev-72B +(APIServer pid=1) INFO 04-22 00:02:45 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:02:45 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:02:45 [entrypoints/utils.py:233] non-default args: {'model_tag': 'moonshotai/Kimi-Dev-72B', 'model': 'moonshotai/Kimi-Dev-72B', 'trust_remote_code': True, 'max_model_len': 8192, 'tensor_parallel_size': 4, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:02:45 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:02:45 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:02:45 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0041262 secs +(APIServer pid=1) INFO 04-22 00:02:45 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM +(APIServer pid=1) INFO 04-22 00:02:45 
[config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:02:45 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:02:45 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:02:45 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:02:45 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:02:45 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:02:45 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 00:02:45 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:02:45 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) INFO 04-22 00:02:47 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 00:02:47 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. 
+(APIServer pid=1) DEBUG 04-22 00:02:47 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:02:47 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:02:47 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:02:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:02:51 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:02:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:02:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:02:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:02:51 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:02:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:02:51 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:02:51 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:02:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:02:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:02:51 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:02:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=245) DEBUG 04-22 00:02:57 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:02:57 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=245) DEBUG 04-22 00:02:57 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/24a4d7cd-445c-4acd-85bf-2b111701ad2a'], outputs=['ipc:///tmp/a8007d5e-3590-4db5-b7d9-eb2caffb59d6'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=245) DEBUG 04-22 00:02:57 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=245) DEBUG 04-22 00:02:57 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=245) DEBUG 04-22 00:02:57 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=245) DEBUG 04-22 00:02:57 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=245) DEBUG 04-22 00:02:57 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=245) INFO 04-22 00:02:57 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='moonshotai/Kimi-Dev-72B', speculative_config=None, tokenizer='moonshotai/Kimi-Dev-72B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=moonshotai/Kimi-Dev-72B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [128, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=245) WARNING 04-22 00:02:57 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=245) INFO 04-22 00:02:57 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.7.77 (local), world_size=4, local_world_size=4 +(EngineCore pid=245) DEBUG 04-22 00:02:57 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/0da9bf7e-69ce-4b6c-a385-5dc105766277 +(EngineCore pid=245) DEBUG 04-22 00:02:57 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_d1ae5a57'), local_subscribe_addr='ipc:///tmp/0da9bf7e-69ce-4b6c-a385-5dc105766277', local_notify_addr='ipc:///tmp/74d47614-c921-443b-b79b-34b5af61b7a5', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 00:03:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:03:01 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:03:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:03:01 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-22 00:03:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:03:01 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:03:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:03:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:03:01 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:03:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:03:01 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:03:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:03:01 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:03:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:03:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
+DEBUG 04-22 00:03:01 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:03:01 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:03:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:03:01 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:03:01 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:03:05 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:03:06 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:03:06 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 00:03:06 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:03:07 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:03:07 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:03:07 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:03:07 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 00:03:07 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:03:07 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:03:07 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:03:07 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 00:03:07 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:03:07 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:03:07 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:03:07 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+DEBUG 04-22 00:03:07 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:03:07 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:03:07 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:03:07 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) DEBUG 04-22 00:03:07 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker pid=447) DEBUG 04-22 00:03:08 [distributed/parallel_state.py:1356] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:49893 backend=nccl +(Worker pid=447) INFO 04-22 00:03:08 [distributed/parallel_state.py:1400] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:49893 backend=nccl +(Worker pid=444) DEBUG 04-22 00:03:08 [distributed/parallel_state.py:1356] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:49893 backend=nccl +(Worker pid=444) INFO 04-22 00:03:08 [distributed/parallel_state.py:1400] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:49893 backend=nccl +(Worker pid=446) DEBUG 04-22 00:03:08 [distributed/parallel_state.py:1356] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:49893 backend=nccl +(Worker pid=446) INFO 04-22 00:03:08 [distributed/parallel_state.py:1400] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:49893 backend=nccl +(Worker pid=445) DEBUG 04-22 00:03:08 [distributed/parallel_state.py:1356] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:49893 backend=nccl +(Worker pid=445) INFO 04-22 00:03:08 [distributed/parallel_state.py:1400] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:49893 
backend=nccl +[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=445) DEBUG 04-22 00:03:08 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=446) DEBUG 04-22 00:03:08 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=444) DEBUG 04-22 00:03:08 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=447) DEBUG 04-22 00:03:08 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(Worker pid=446) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=446) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=447) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=447) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=444) DEBUG 04-22 00:03:08 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=444) INFO 04-22 00:03:08 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=445) DEBUG 04-22 00:03:09 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=444) DEBUG 04-22 00:03:09 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=446) DEBUG 04-22 00:03:09 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=447) DEBUG 04-22 00:03:09 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=444) DEBUG 04-22 00:03:09 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/2869822b-8e70-436a-bbad-a93e3e91136e +(Worker pid=444) DEBUG 04-22 00:03:09 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3], buffer_handle=(3, 4194304, 6, 'psm_2bf3a753'), local_subscribe_addr='ipc:///tmp/2869822b-8e70-436a-bbad-a93e3e91136e', local_notify_addr='ipc:///tmp/5140ae1d-9907-467e-83e9-4c140f16fe69', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=447) DEBUG 04-22 00:03:09 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2869822b-8e70-436a-bbad-a93e3e91136e +(Worker pid=445) DEBUG 04-22 00:03:09 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2869822b-8e70-436a-bbad-a93e3e91136e +(Worker pid=446) DEBUG 04-22 00:03:09 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2869822b-8e70-436a-bbad-a93e3e91136e +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(Worker pid=444) INFO 04-22 00:03:09 [distributed/parallel_state.py:1716] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(Worker pid=446) DEBUG 04-22 00:03:10 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776816190.2286594, auto_measure=True +(Worker pid=446) DEBUG 04-22 00:03:10 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=444) DEBUG 04-22 00:03:10 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776816190.2827969, auto_measure=True +(Worker pid=444) DEBUG 04-22 00:03:10 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. 
+(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 605, in __init__ +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] self.worker.init_device() +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 312, in init_device +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] self.worker.init_device() # type: ignore +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ 
+(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 283, in init_device +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] self.requested_memory = request_memory(init_snapshot, self.cache_config) +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/utils.py", line 413, in request_memory +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] raise ValueError( +(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] ValueError: Free memory on device cuda:1 (60.85/79.19 GiB) on startup is less than desired GPU memory utilization (0.95, 75.23 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes. 
+(EngineCore pid=245) DEBUG 04-22 00:03:10 [v1/executor/multiproc_executor.py:419] Worker Termination: allow workers to gracefully shutdown +(Worker pid=447) DEBUG 04-22 00:03:10 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776816190.2981615, auto_measure=True +(Worker pid=447) DEBUG 04-22 00:03:10 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=446) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=446) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=446) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=446) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=444) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=446) DEBUG 04-22 00:03:10 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). 
+(Worker pid=444) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=447) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=447) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=447) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=444) DEBUG 04-22 00:03:10 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=444) DEBUG 04-22 00:03:10 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=447) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=447) DEBUG 04-22 00:03:10 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=444) DEBUG 04-22 00:03:10 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_TP0 pid=444) INFO 04-22 00:03:10 [v1/worker/gpu_model_runner.py:4735] Starting to load model moonshotai/Kimi-Dev-72B... 
+(Worker_TP0 pid=444) DEBUG 04-22 00:03:10 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_TP0 pid=444) INFO 04-22 00:03:10 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(Worker_TP0 pid=444) INFO 04-22 00:03:10 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP2 pid=446) DEBUG 04-22 00:03:11 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP2 pid=446) DEBUG 04-22 00:03:11 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP2 pid=446) DEBUG 04-22 00:03:11 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP2 pid=446) DEBUG 04-22 00:03:11 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP0 pid=444) DEBUG 04-22 00:03:11 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP3 pid=447) DEBUG 04-22 00:03:11 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=444) DEBUG 04-22 00:03:11 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=444) DEBUG 04-22 00:03:11 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=444) DEBUG 04-22 00:03:11 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP3 pid=447) DEBUG 04-22 00:03:11 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP3 pid=447) DEBUG 04-22 00:03:11 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP3 pid=447) DEBUG 04-22 00:03:11 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP2 pid=446) DEBUG 04-22 00:03:11 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-10-of-80.safetensors', 'model-39-of-80.safetensors', 'model-15-of-80.safetensors', 'model-18-of-80.safetensors', 'model-14-of-80.safetensors', 'model-51-of-80.safetensors', 'model-66-of-80.safetensors', 'model-55-of-80.safetensors', 'model-21-of-80.safetensors', 'model-22-of-80.safetensors', 'model-36-of-80.safetensors', 'model-8-of-80.safetensors', 'model-79-of-80.safetensors', 'model-50-of-80.safetensors', 'model-40-of-80.safetensors', 'model-57-of-80.safetensors', 'model-30-of-80.safetensors', 'model-41-of-80.safetensors', 'model-32-of-80.safetensors', 'model-25-of-80.safetensors', 'model-24-of-80.safetensors', 'model-12-of-80.safetensors', 'model-77-of-80.safetensors', 'model-61-of-80.safetensors', 'model-73-of-80.safetensors', 'model-53-of-80.safetensors', 'model-4-of-80.safetensors', 'model-17-of-80.safetensors', 'model-38-of-80.safetensors', 'model-71-of-80.safetensors', 'model-6-of-80.safetensors', 'model-31-of-80.safetensors', 'model-64-of-80.safetensors', 'model-1-of-80.safetensors', 'model-54-of-80.safetensors', 'model-45-of-80.safetensors', 'model-35-of-80.safetensors', 'model-13-of-80.safetensors', 'model-37-of-80.safetensors', 'model-78-of-80.safetensors', 'model-72-of-80.safetensors', 'model-74-of-80.safetensors', 'model-3-of-80.safetensors', 'model-46-of-80.safetensors', 'model-29-of-80.safetensors', 'model-27-of-80.safetensors', 
'model-20-of-80.safetensors', 'model-75-of-80.safetensors', 'model-63-of-80.safetensors', 'model-44-of-80.safetensors', 'model-26-of-80.safetensors', 'model-23-of-80.safetensors', 'model-65-of-80.safetensors', 'model-62-of-80.safetensors', 'model-70-of-80.safetensors', 'model-9-of-80.safetensors', 'model-56-of-80.safetensors', 'model-42-of-80.safetensors', 'model-5-of-80.safetensors', 'model-58-of-80.safetensors', 'model-47-of-80.safetensors', 'model-52-of-80.safetensors', 'model-80-of-80.safetensors', 'model-67-of-80.safetensors', 'model-69-of-80.safetensors', 'model-59-of-80.safetensors', 'model-19-of-80.safetensors', 'model-7-of-80.safetensors', 'model-49-of-80.safetensors', 'model-76-of-80.safetensors', 'model-68-of-80.safetensors', 'model-16-of-80.safetensors', 'model-43-of-80.safetensors', 'model-2-of-80.safetensors', 'model-28-of-80.safetensors', 'model-34-of-80.safetensors', 'model-33-of-80.safetensors', 'model-11-of-80.safetensors', 'model-60-of-80.safetensors', 'model-48-of-80.safetensors']] +(Worker_TP0 pid=444) DEBUG 04-22 00:03:11 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-62-of-80.safetensors', 'model-1-of-80.safetensors', 'model-33-of-80.safetensors', 'model-31-of-80.safetensors', 'model-9-of-80.safetensors', 'model-13-of-80.safetensors', 'model-58-of-80.safetensors', 'model-21-of-80.safetensors', 'model-2-of-80.safetensors', 'model-39-of-80.safetensors', 'model-63-of-80.safetensors', 'model-60-of-80.safetensors', 'model-29-of-80.safetensors', 'model-10-of-80.safetensors', 'model-54-of-80.safetensors', 'model-23-of-80.safetensors', 'model-75-of-80.safetensors', 'model-36-of-80.safetensors', 'model-64-of-80.safetensors', 'model-28-of-80.safetensors', 'model-67-of-80.safetensors', 'model-48-of-80.safetensors', 'model-49-of-80.safetensors', 'model-8-of-80.safetensors', 'model-45-of-80.safetensors', 'model-14-of-80.safetensors', 'model-40-of-80.safetensors', 'model-7-of-80.safetensors', 
'model-65-of-80.safetensors', 'model-19-of-80.safetensors', 'model-5-of-80.safetensors', 'model-73-of-80.safetensors', 'model-53-of-80.safetensors', 'model-77-of-80.safetensors', 'model-35-of-80.safetensors', 'model-37-of-80.safetensors', 'model-41-of-80.safetensors', 'model-38-of-80.safetensors', 'model-18-of-80.safetensors', 'model-3-of-80.safetensors', 'model-52-of-80.safetensors', 'model-76-of-80.safetensors', 'model-51-of-80.safetensors', 'model-16-of-80.safetensors', 'model-34-of-80.safetensors', 'model-17-of-80.safetensors', 'model-47-of-80.safetensors', 'model-42-of-80.safetensors', 'model-27-of-80.safetensors', 'model-70-of-80.safetensors', 'model-74-of-80.safetensors', 'model-46-of-80.safetensors', 'model-12-of-80.safetensors', 'model-71-of-80.safetensors', 'model-66-of-80.safetensors', 'model-69-of-80.safetensors', 'model-61-of-80.safetensors', 'model-57-of-80.safetensors', 'model-43-of-80.safetensors', 'model-25-of-80.safetensors', 'model-79-of-80.safetensors', 'model-26-of-80.safetensors', 'model-22-of-80.safetensors', 'model-68-of-80.safetensors', 'model-30-of-80.safetensors', 'model-20-of-80.safetensors', 'model-59-of-80.safetensors', 'model-80-of-80.safetensors', 'model-24-of-80.safetensors', 'model-6-of-80.safetensors', 'model-72-of-80.safetensors', 'model-55-of-80.safetensors', 'model-78-of-80.safetensors', 'model-56-of-80.safetensors', 'model-50-of-80.safetensors', 'model-4-of-80.safetensors', 'model-32-of-80.safetensors', 'model-11-of-80.safetensors', 'model-15-of-80.safetensors', 'model-44-of-80.safetensors']] +(Worker_TP3 pid=447) DEBUG 04-22 00:03:11 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-67-of-80.safetensors', 'model-50-of-80.safetensors', 'model-51-of-80.safetensors', 'model-63-of-80.safetensors', 'model-66-of-80.safetensors', 'model-68-of-80.safetensors', 'model-39-of-80.safetensors', 'model-15-of-80.safetensors', 'model-3-of-80.safetensors', 'model-30-of-80.safetensors', 
'model-77-of-80.safetensors', 'model-47-of-80.safetensors', 'model-55-of-80.safetensors', 'model-57-of-80.safetensors', 'model-7-of-80.safetensors', 'model-11-of-80.safetensors', 'model-80-of-80.safetensors', 'model-56-of-80.safetensors', 'model-10-of-80.safetensors', 'model-35-of-80.safetensors', 'model-65-of-80.safetensors', 'model-21-of-80.safetensors', 'model-74-of-80.safetensors', 'model-1-of-80.safetensors', 'model-19-of-80.safetensors', 'model-4-of-80.safetensors', 'model-2-of-80.safetensors', 'model-18-of-80.safetensors', 'model-33-of-80.safetensors', 'model-22-of-80.safetensors', 'model-12-of-80.safetensors', 'model-61-of-80.safetensors', 'model-41-of-80.safetensors', 'model-58-of-80.safetensors', 'model-48-of-80.safetensors', 'model-42-of-80.safetensors', 'model-38-of-80.safetensors', 'model-25-of-80.safetensors', 'model-20-of-80.safetensors', 'model-78-of-80.safetensors', 'model-46-of-80.safetensors', 'model-37-of-80.safetensors', 'model-13-of-80.safetensors', 'model-9-of-80.safetensors', 'model-23-of-80.safetensors', 'model-59-of-80.safetensors', 'model-44-of-80.safetensors', 'model-72-of-80.safetensors', 'model-43-of-80.safetensors', 'model-40-of-80.safetensors', 'model-73-of-80.safetensors', 'model-24-of-80.safetensors', 'model-17-of-80.safetensors', 'model-49-of-80.safetensors', 'model-29-of-80.safetensors', 'model-6-of-80.safetensors', 'model-76-of-80.safetensors', 'model-45-of-80.safetensors', 'model-52-of-80.safetensors', 'model-34-of-80.safetensors', 'model-8-of-80.safetensors', 'model-27-of-80.safetensors', 'model-53-of-80.safetensors', 'model-26-of-80.safetensors', 'model-70-of-80.safetensors', 'model-16-of-80.safetensors', 'model-31-of-80.safetensors', 'model-69-of-80.safetensors', 'model-36-of-80.safetensors', 'model-14-of-80.safetensors', 'model-60-of-80.safetensors', 'model-28-of-80.safetensors', 'model-62-of-80.safetensors', 'model-54-of-80.safetensors', 'model-75-of-80.safetensors', 'model-64-of-80.safetensors', 
'model-32-of-80.safetensors', 'model-79-of-80.safetensors', 'model-5-of-80.safetensors', 'model-71-of-80.safetensors']] +(Worker_TP0 pid=444) Loading safetensors checkpoint shards: 0% Completed | 0/80 [00:00 +(APIServer pid=1) sys.exit(main()) +(APIServer pid=1) ^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main +(APIServer pid=1) args.dispatch_function(args) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd +(APIServer pid=1) uvloop.run(run_server(args)) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run +(APIServer pid=1) return __asyncio.run( +(APIServer pid=1) ^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run +(APIServer pid=1) return runner.run(main) +(APIServer pid=1) ^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run +(APIServer pid=1) return self._loop.run_until_complete(task) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper +(APIServer pid=1) return await main +(APIServer pid=1) ^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server +(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker +(APIServer pid=1) async with build_async_engine_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer 
pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client +(APIServer pid=1) async with build_async_engine_client_from_engine_args( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args +(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config +(APIServer pid=1) return cls( +(APIServer pid=1) ^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 154, in __init__ +(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(APIServer pid=1) return func(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client +(APIServer pid=1) return AsyncMPClient(*client_args) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(APIServer pid=1) return func(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 887, in __init__ +(APIServer pid=1) super().__init__( +(APIServer pid=1) File 
"/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 535, in __init__ +(APIServer pid=1) with launch_core_engines( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ +(APIServer pid=1) next(self.gen) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines +(APIServer pid=1) wait_for_engine_startup( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup +(APIServer pid=1) raise RuntimeError( +(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} +/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown + warnings.warn('resource_tracker: There appear to be %d ' diff --git a/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.log new file mode 100644 index 00000000..6b00b529 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.log @@ -0,0 +1,2854 @@ +DEBUG 04-22 01:32:24 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:32:24 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:32:24 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:32:24 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:32:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:32:24 [platforms/__init__.py:113] Checking if ROCm platform is available. 
+DEBUG 04-22 01:32:24 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:32:24 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:32:24 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:32:24 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:32:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:32:24 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:32:29 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:32:30 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 01:32:30 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:32:30 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:32:30 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:32:30 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:32:31 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:32:31 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:32:31 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:32:31 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model moonshotai/Kimi-Dev-72B +(APIServer pid=1) INFO 04-22 01:32:31 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:32:31 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:32:31 [entrypoints/utils.py:233] non-default args: {'model_tag': 'moonshotai/Kimi-Dev-72B', 'model': 'moonshotai/Kimi-Dev-72B', 'trust_remote_code': True, 'max_model_len': 8192, 'tensor_parallel_size': 4, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:32:31 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:32:31 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:32:31 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0007761 secs +(APIServer pid=1) INFO 04-22 01:32:31 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM +(APIServer pid=1) INFO 04-22 01:32:31 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 01:32:31 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:32:31 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:32:31 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:32:31 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 01:32:31 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:32:31 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 01:32:31 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:32:31 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) INFO 04-22 01:32:33 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 01:32:33 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:32:33 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:32:34 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:32:34 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:32:37 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:32:37 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:32:37 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:32:37 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:32:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:32:38 [platforms/__init__.py:113] Checking if ROCm platform is available. 
+DEBUG 04-22 01:32:38 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:32:38 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:32:38 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:32:38 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:32:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:32:38 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:32:42 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=243) DEBUG 04-22 01:32:44 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:32:44 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=243) DEBUG 04-22 01:32:44 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/9bc312a7-0b33-4921-9fae-b1f2ff7db368'], outputs=['ipc:///tmp/d3d42416-6cd7-4dcf-9c68-fe976bc2ee76'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 01:32:44 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 01:32:44 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 01:32:44 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 01:32:44 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 01:32:44 [plugins/__init__.py:49] All plugins in this group will be 
loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(EngineCore pid=243) INFO 04-22 01:32:44 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='moonshotai/Kimi-Dev-72B', speculative_config=None, tokenizer='moonshotai/Kimi-Dev-72B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=moonshotai/Kimi-Dev-72B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [128, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) WARNING 04-22 01:32:44 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=243) INFO 04-22 01:32:44 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.3.122 (local), world_size=4, local_world_size=4 +(EngineCore pid=243) DEBUG 04-22 01:32:44 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/a440f459-16ec-4f4b-b25a-28418caf8c99 +(EngineCore pid=243) DEBUG 04-22 01:32:44 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_7e5af658'), local_subscribe_addr='ipc:///tmp/a440f459-16ec-4f4b-b25a-28418caf8c99', local_notify_addr='ipc:///tmp/638a17a4-2ae0-4a34-86c2-b2ff6ae03f64', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 01:32:47 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:32:47 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:32:47 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:32:47 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:32:47 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:32:47 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:32:47 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:32:47 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-22 01:32:47 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:32:47 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:32:47 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:32:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:32:47 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:32:47 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:165] Checking if CPU platform is available. 
+DEBUG 04-22 01:32:47 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:32:47 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:32:47 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:32:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:32:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:32:47 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:32:52 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:32:52 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:32:52 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 01:32:52 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(APIServer pid=1) DEBUG 04-22 01:32:54 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +DEBUG 04-22 01:32:54 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:32:54 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:32:54 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:32:54 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 01:32:54 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:32:54 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:32:54 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:32:54 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 01:32:54 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:32:54 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:32:54 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:32:54 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+DEBUG 04-22 01:32:54 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:32:54 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:32:54 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:32:54 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(Worker pid=442) DEBUG 04-22 01:32:55 [distributed/parallel_state.py:1356] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:60879 backend=nccl +(Worker pid=442) INFO 04-22 01:32:55 [distributed/parallel_state.py:1400] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:60879 backend=nccl +(Worker pid=443) DEBUG 04-22 01:32:55 [distributed/parallel_state.py:1356] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:60879 backend=nccl +(Worker pid=443) INFO 04-22 01:32:55 [distributed/parallel_state.py:1400] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:60879 backend=nccl +(Worker pid=445) DEBUG 04-22 01:32:55 [distributed/parallel_state.py:1356] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:60879 backend=nccl +(Worker pid=445) INFO 04-22 01:32:55 [distributed/parallel_state.py:1400] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:60879 backend=nccl +(Worker pid=444) DEBUG 04-22 01:32:55 [distributed/parallel_state.py:1356] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:60879 backend=nccl +(Worker pid=444) INFO 04-22 01:32:55 [distributed/parallel_state.py:1400] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:60879 backend=nccl +[Gloo] Rank 2 is connected to 3 peer ranks. 
Expected number of connected peer ranks is : 3 +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=442) DEBUG 04-22 01:32:55 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=444) DEBUG 04-22 01:32:55 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=443) DEBUG 04-22 01:32:55 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=445) DEBUG 04-22 01:32:55 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(Worker pid=442) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=442) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=442) DEBUG 04-22 01:32:55 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=442) INFO 04-22 01:32:55 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=443) DEBUG 04-22 01:32:56 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=445) DEBUG 04-22 01:32:56 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=442) DEBUG 04-22 01:32:56 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=444) DEBUG 04-22 01:32:56 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=442) DEBUG 04-22 01:32:57 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/2553b6ac-4d99-40c7-8c8e-99b100945ec0 +(Worker pid=442) DEBUG 04-22 01:32:57 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3], buffer_handle=(3, 4194304, 6, 'psm_de4932bf'), local_subscribe_addr='ipc:///tmp/2553b6ac-4d99-40c7-8c8e-99b100945ec0', local_notify_addr='ipc:///tmp/153501ca-24ac-4494-b023-cabdee1b39ce', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=444) DEBUG 04-22 01:32:57 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2553b6ac-4d99-40c7-8c8e-99b100945ec0 +(Worker pid=443) DEBUG 04-22 01:32:57 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2553b6ac-4d99-40c7-8c8e-99b100945ec0 +(Worker pid=445) DEBUG 04-22 01:32:57 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2553b6ac-4d99-40c7-8c8e-99b100945ec0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(Worker pid=442) INFO 04-22 01:32:57 [distributed/parallel_state.py:1716] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(Worker pid=442) DEBUG 04-22 01:32:57 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776821577.3190374, auto_measure=True +(Worker pid=442) DEBUG 04-22 01:32:57 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=443) DEBUG 04-22 01:32:57 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776821577.3577726, auto_measure=True +(Worker pid=443) DEBUG 04-22 01:32:57 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=444) DEBUG 04-22 01:32:57 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776821577.3819945, auto_measure=True +(Worker pid=444) DEBUG 04-22 01:32:57 [v1/worker/gpu_worker.py:285] worker 
requested memory: 75.23GiB +(Worker pid=445) DEBUG 04-22 01:32:57 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776821577.4167647, auto_measure=True +(Worker pid=445) DEBUG 04-22 01:32:57 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=442) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=442) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=442) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=442) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=442) DEBUG 04-22 01:32:57 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(Worker pid=443) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=442) DEBUG 04-22 01:32:57 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=444) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=444) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=445) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=445) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=443) DEBUG 04-22 01:32:57 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). 
+(Worker pid=445) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=442) DEBUG 04-22 01:32:57 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker pid=444) DEBUG 04-22 01:32:57 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker_TP0 pid=442) INFO 04-22 01:32:57 [v1/worker/gpu_model_runner.py:4735] Starting to load model moonshotai/Kimi-Dev-72B... +(Worker pid=445) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=445) DEBUG 04-22 01:32:57 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker_TP0 pid=442) DEBUG 04-22 01:32:57 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_TP0 pid=442) INFO 04-22 01:32:57 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(Worker_TP0 pid=442) INFO 04-22 01:32:57 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP0 pid=442) DEBUG 04-22 01:32:58 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=442) DEBUG 04-22 01:32:58 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=442) DEBUG 04-22 01:32:58 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=442) DEBUG 04-22 01:32:58 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP1 pid=443) DEBUG 04-22 01:32:58 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP1 pid=443) DEBUG 04-22 01:32:58 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP1 pid=443) DEBUG 04-22 01:32:58 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=443) DEBUG 04-22 01:32:58 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP3 pid=445) DEBUG 04-22 01:32:58 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP3 pid=445) DEBUG 04-22 01:32:58 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP3 pid=445) DEBUG 04-22 01:32:58 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP3 pid=445) DEBUG 04-22 01:32:58 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP2 pid=444) DEBUG 04-22 01:32:58 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP2 pid=444) DEBUG 04-22 01:32:58 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP2 pid=444) DEBUG 04-22 01:32:58 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP2 pid=444) DEBUG 04-22 01:32:58 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP0 pid=442) DEBUG 04-22 01:32:58 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-22-of-80.safetensors', 'model-32-of-80.safetensors', 'model-46-of-80.safetensors', 'model-36-of-80.safetensors', 'model-53-of-80.safetensors', 'model-29-of-80.safetensors', 'model-8-of-80.safetensors', 'model-4-of-80.safetensors', 'model-24-of-80.safetensors', 'model-2-of-80.safetensors', 'model-12-of-80.safetensors', 'model-59-of-80.safetensors', 'model-11-of-80.safetensors', 'model-45-of-80.safetensors', 'model-72-of-80.safetensors', 'model-13-of-80.safetensors', 'model-51-of-80.safetensors', 'model-33-of-80.safetensors', 'model-1-of-80.safetensors', 'model-67-of-80.safetensors', 'model-30-of-80.safetensors', 'model-74-of-80.safetensors', 'model-80-of-80.safetensors', 'model-76-of-80.safetensors', 'model-26-of-80.safetensors', 'model-71-of-80.safetensors', 'model-44-of-80.safetensors', 'model-31-of-80.safetensors', 'model-78-of-80.safetensors', 'model-57-of-80.safetensors', 'model-6-of-80.safetensors', 'model-39-of-80.safetensors', 'model-7-of-80.safetensors', 'model-16-of-80.safetensors', 'model-28-of-80.safetensors', 'model-42-of-80.safetensors', 'model-56-of-80.safetensors', 'model-79-of-80.safetensors', 'model-61-of-80.safetensors', 'model-3-of-80.safetensors', 'model-38-of-80.safetensors', 'model-21-of-80.safetensors', 'model-5-of-80.safetensors', 
'model-68-of-80.safetensors', 'model-25-of-80.safetensors', 'model-14-of-80.safetensors', 'model-62-of-80.safetensors', 'model-65-of-80.safetensors', 'model-40-of-80.safetensors', 'model-73-of-80.safetensors', 'model-23-of-80.safetensors', 'model-15-of-80.safetensors', 'model-19-of-80.safetensors', 'model-43-of-80.safetensors', 'model-35-of-80.safetensors', 'model-47-of-80.safetensors', 'model-58-of-80.safetensors', 'model-18-of-80.safetensors', 'model-27-of-80.safetensors', 'model-17-of-80.safetensors', 'model-52-of-80.safetensors', 'model-55-of-80.safetensors', 'model-63-of-80.safetensors', 'model-34-of-80.safetensors', 'model-60-of-80.safetensors', 'model-64-of-80.safetensors', 'model-49-of-80.safetensors', 'model-50-of-80.safetensors', 'model-75-of-80.safetensors', 'model-69-of-80.safetensors', 'model-77-of-80.safetensors', 'model-48-of-80.safetensors', 'model-20-of-80.safetensors', 'model-41-of-80.safetensors', 'model-10-of-80.safetensors', 'model-70-of-80.safetensors', 'model-66-of-80.safetensors', 'model-37-of-80.safetensors', 'model-9-of-80.safetensors', 'model-54-of-80.safetensors']] +(Worker_TP0 pid=442) Loading safetensors checkpoint shards: 0% Completed | 0/80 [00:00 +(Worker_TP1 pid=443) DEBUG 04-22 01:34:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 01:34:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 01:34:04 [compilation/decorators.py:528] Start compiling function +(Worker_TP2 pid=444) DEBUG 04-22 01:34:04 [compilation/decorators.py:528] Start compiling function +(Worker_TP0 pid=442) DEBUG 04-22 01:34:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 01:34:04 [compilation/decorators.py:528] Start compiling function +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] Traced files (to be considered for 
compilation cache): +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP0 pid=442) INFO 04-22 01:34:13 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone for vLLM's torch.compile +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=fc91086a3c comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
+(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP0 pid=442) DEBUG 04-22 
01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP0 pid=442) 
DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 
01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP0 
pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP0 
pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] Vllm config hash: fc91086a3c +(Worker_TP0 pid=442) INFO 04-22 01:34:13 [compilation/backends.py:1111] Dynamo bytecode transform time: 9.27 s +(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 2 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 128 +(Worker_TP0 pid=442) INFO 04-22 01:34:13 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=fc91086a3c comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_1_0/backbone +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': 
None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 
'binary', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, 
+(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 
600, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP1 
pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP1 pid=443) 
DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] Vllm config hash: fc91086a3c +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=fc91086a3c comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 
dir=/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_2_0/backbone +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP2 
pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP2 pid=444) DEBUG 
04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] Vllm config hash: fc91086a3c +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py 
+(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP3 
pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=fc91086a3c comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_3_0/backbone +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP3 pid=445) DEBUG 
04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP3 pid=445) DEBUG 04-22 
01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP3 pid=445) DEBUG 
04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP3 pid=445) DEBUG 
04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 
0, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP3 pid=445) DEBUG 
04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] Vllm config hash: fc91086a3c +(APIServer pid=1) DEBUG 04-22 01:34:14 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=442) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. 
+(Worker_TP0 pid=442) return func(*args, **kwargs) +(Worker_TP1 pid=443) DEBUG 04-22 01:34:14 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=1, max_token_num=128, hidden_dim=8192, dtype=torch.bfloat16 +(Worker_TP0 pid=442) DEBUG 04-22 01:34:14 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=0, max_token_num=128, hidden_dim=8192, dtype=torch.bfloat16 +(Worker_TP3 pid=445) DEBUG 04-22 01:34:14 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=3, max_token_num=128, hidden_dim=8192, dtype=torch.bfloat16 +(Worker_TP2 pid=444) DEBUG 04-22 01:34:14 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=2, max_token_num=128, hidden_dim=8192, dtype=torch.bfloat16 +(Worker_TP0 pid=442) INFO 04-22 01:34:14 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm +(Worker_TP0 pid=442) DEBUG 04-22 01:34:15 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 128), (129, 8192)] +(Worker_TP0 pid=442) DEBUG 04-22 01:34:15 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(Worker_TP0 pid=442) DEBUG 04-22 01:34:16 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:16 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP0 pid=442) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 35.9 ms +(Worker_TP0 pid=442) 
DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:16 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP0 pid=442) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:16 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:16 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=444) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:16 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP1 pid=443) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 35.5 ms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:16 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP1 pid=443) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:16 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=444) DEBUG 04-22 01:34:16 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP3 pid=445) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:16 
[compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 36.5 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:16 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP2 pid=444) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:16 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP3 pid=445) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.6 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:16 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP3 pid=445) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.7 ms +(Worker_TP0 pid=442) INFO 04-22 01:34:18 [compilation/backends.py:372] Cache the graph of compile range (1, 128) for later use +(Worker_TP0 pid=442) DEBUG 04-22 01:34:18 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 128) from inductor_standalone via handle ('artifact_compile_range_1_128_subgraph_0', '/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone/artifact_compile_range_1_128_subgraph_0') +(Worker_TP0 pid=442) DEBUG 04-22 01:34:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 01:34:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:19 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP0 pid=442) DEBUG 04-22 01:34:19 
[compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=442) DEBUG 04-22 01:34:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=444) DEBUG 04-22 01:34:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:19 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP2 pid=444) DEBUG 04-22 01:34:19 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=444) DEBUG 04-22 01:34:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=445) DEBUG 04-22 01:34:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:19 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP3 pid=445) DEBUG 04-22 01:34:19 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=445) DEBUG 04-22 01:34:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=442) INFO 04-22 01:34:20 [compilation/backends.py:372] Cache 
the graph of compile range (129, 8192) for later use +(Worker_TP0 pid=442) DEBUG 04-22 01:34:20 [compilation/backends.py:377] Store the 0-th graph for compile range(129, 8192) from inductor_standalone via handle ('artifact_compile_range_129_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone/artifact_compile_range_129_8192_subgraph_0') +(Worker_TP1 pid=443) DEBUG 04-22 01:34:20 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:20 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP1 pid=443) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:20 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=443) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:20 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:20 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=442) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 65.8 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:20 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=442) DEBUG 04-22 01:34:20 
[compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:20 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=444) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:20 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP2 pid=444) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 66.3 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:20 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP2 pid=444) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.6 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:20 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=445) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:20 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP3 pid=445) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 68.0 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:20 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP3 pid=445) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.6 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:21 [compilation/backends.py:377] Store the 1-th graph for 
compile range(1, 128) from inductor_standalone via handle ('artifact_compile_range_1_128_subgraph_1', '/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone/artifact_compile_range_1_128_subgraph_1') +(Worker_TP1 pid=443) DEBUG 04-22 01:34:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:21 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=443) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 68.8 ms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:21 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=443) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:21 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP0 pid=442) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:21 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=442) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:21 [compilation/.../utility/noop_elimination.py:105] Removed 
0 no-op reshapes and slices +(Worker_TP2 pid=444) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:21 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP2 pid=444) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:21 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=444) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=445) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:21 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP3 pid=445) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:21 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=445) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:22 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 01:34:22 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:22 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP1 pid=443) DEBUG 04-22 01:34:22 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP1 pid=443) DEBUG 
04-22 01:34:22 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=443) DEBUG 04-22 01:34:22 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:23 [compilation/backends.py:377] Store the 1-th graph for compile range(129, 8192) from inductor_standalone via handle ('artifact_compile_range_129_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone/artifact_compile_range_129_8192_subgraph_1') +(APIServer pid=1) DEBUG 04-22 01:34:24 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 67.7 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/backends.py:377] Store the 80-th graph for compile range(1, 128) from inductor_standalone via handle ('artifact_compile_range_1_128_subgraph_80', '/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone/artifact_compile_range_1_128_subgraph_80') +(Worker_TP0 pid=442) INFO 04-22 01:34:29 [compilation/backends.py:390] Compiling a graph 
for compile range (1, 128) takes 10.62 s +(Worker_TP2 pid=444) DEBUG 04-22 01:34:29 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=444) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:29 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP2 pid=444) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 66.9 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:29 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP2 pid=444) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:30 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=444) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP2 
pid=444) DEBUG 04-22 01:34:30 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP2 pid=444) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP2 pid=444) DEBUG 04-22 01:34:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=444) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:30 [compilation/backends.py:377] Store the 80-th graph for compile range(129, 8192) from inductor_standalone via handle ('artifact_compile_range_129_8192_subgraph_80', '/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone/artifact_compile_range_129_8192_subgraph_80') +(Worker_TP0 pid=442) INFO 04-22 01:34:30 [compilation/backends.py:390] Compiling a graph for compile range (129, 8192) takes 11.54 s +(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 67.8 ms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes 
+(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 65.4 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=442) DEBUG 04-22 01:34:30 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone/computation_graph.py +(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.6 ms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed 
in 0.6 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=442) INFO 04-22 01:34:33 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/0fb79f10c474cc267fe2b1b1648d8efdb738b7e56acfe7e43c9b23ddaa70d3c4/rank_0_0/model +(Worker_TP0 pid=442) INFO 04-22 01:34:33 [compilation/monitor.py:48] torch.compile took 28.92 s in total +(APIServer pid=1) DEBUG 04-22 01:34:34 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP0 pid=442) INFO 04-22 01:34:35 [compilation/monitor.py:76] Initial profiling/warmup run took 2.25 s +(Worker_TP0 pid=442) INFO 04-22 01:34:41 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP0 pid=442) DEBUG 04-22 01:34:41 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP0 pid=442) INFO 04-22 01:34:41 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP3 pid=445) INFO 04-22 01:34:41 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP3 pid=445) DEBUG 04-22 01:34:41 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP3 pid=445) INFO 04-22 01:34:41 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP1 pid=443) INFO 04-22 01:34:41 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP1 pid=443) DEBUG 04-22 01:34:41 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP1 pid=443) INFO 04-22 01:34:41 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP0 pid=442) DEBUG 04-22 01:34:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 01:34:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 01:34:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) INFO 04-22 01:34:42 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP2 pid=444) DEBUG 04-22 01:34:42 [v1/worker/gpu_model_runner.py:5818] 
Initialized minimal KV cache for CUDA graph profiling +(Worker_TP2 pid=444) INFO 04-22 01:34:42 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP2 pid=444) DEBUG 04-22 01:34:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 132.00 MiB first-capture + (51-1) × 16.00 MiB per-graph +(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 132.00 MiB 
first-capture + (51-1) × 16.00 MiB per-graph +(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 132.00 MiB first-capture + (51-1) × 16.00 MiB per-graph +(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 132.00 MiB first-capture + (51-1) × 16.00 MiB per-graph +(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on 
(FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 
[v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 142.00 MiB first-capture + (51-1) × 12.00 MiB per-graph +(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 142.00 MiB first-capture + (51-1) × 12.00 MiB per-graph +(Worker_TP0 pid=442) INFO 04-22 01:34:43 [distributed/device_communicators/custom_all_reduce.py:216] Registering 322 cuda graph addresses +(Worker_TP3 pid=445) INFO 04-22 01:34:43 [distributed/device_communicators/custom_all_reduce.py:216] Registering 322 cuda graph addresses +(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 142.00 MiB first-capture + (51-1) × 12.00 MiB per-graph +(Worker_TP1 pid=443) INFO 04-22 01:34:43 [distributed/device_communicators/custom_all_reduce.py:216] Registering 322 cuda graph addresses +(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 142.00 MiB first-capture + (51-1) × 12.00 MiB per-graph +(Worker_TP2 pid=444) INFO 04-22 01:34:43 [distributed/device_communicators/custom_all_reduce.py:216] Registering 322 cuda graph addresses +(APIServer pid=1) DEBUG 04-22 01:34:44 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP0 pid=442) DEBUG 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP0 pid=442) INFO 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.51 GiB total +(Worker_TP1 pid=443) DEBUG 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP1 pid=443) INFO 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.51 GiB total +(Worker_TP3 pid=445) DEBUG 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP3 pid=445) INFO 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.51 GiB total +(Worker_TP2 pid=444) DEBUG 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP2 pid=444) INFO 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.51 GiB total +(Worker_TP0 pid=442) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP0 pid=442) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:430] Free memory after profiling: 40.54 GiB (total), 38.13 GiB (within requested) +(Worker_TP0 pid=442) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:435] Memory profiling takes 40.40 seconds. Total non KV cache memory: 38.51GiB; torch peak memory increase: 2.29GiB; non-torch forward increase memory: 2.21GiB; weights memory: 34.01GiB. +(Worker_TP0 pid=442) INFO 04-22 01:34:44 [v1/worker/gpu_worker.py:436] Available KV cache memory: 36.72 GiB +(Worker_TP0 pid=442) INFO 04-22 01:34:44 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. 
To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9690 to maintain the same effective KV cache size. +(Worker_TP0 pid=442) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=445) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP3 pid=445) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:430] Free memory after profiling: 40.54 GiB (total), 38.13 GiB (within requested) +(Worker_TP3 pid=445) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:435] Memory profiling takes 40.46 seconds. Total non KV cache memory: 38.51GiB; torch peak memory increase: 2.29GiB; non-torch forward increase memory: 2.21GiB; weights memory: 34.01GiB. +(Worker_TP3 pid=445) INFO 04-22 01:34:44 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9690 to maintain the same effective KV cache size. 
+(Worker_TP3 pid=445) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=443) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP1 pid=443) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:430] Free memory after profiling: 40.54 GiB (total), 38.13 GiB (within requested) +(Worker_TP1 pid=443) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:435] Memory profiling takes 40.43 seconds. Total non KV cache memory: 38.51GiB; torch peak memory increase: 2.29GiB; non-torch forward increase memory: 2.21GiB; weights memory: 34.01GiB. +(Worker_TP1 pid=443) INFO 04-22 01:34:44 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9690 to maintain the same effective KV cache size. +(Worker_TP1 pid=443) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=444) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP2 pid=444) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:430] Free memory after profiling: 40.54 GiB (total), 38.13 GiB (within requested) +(Worker_TP2 pid=444) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:435] Memory profiling takes 40.54 seconds. 
Total non KV cache memory: 38.51GiB; torch peak memory increase: 2.29GiB; non-torch forward increase memory: 2.21GiB; weights memory: 34.01GiB. +(Worker_TP2 pid=444) INFO 04-22 01:34:44 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9690 to maintain the same effective KV cache size. +(Worker_TP2 pid=444) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) INFO 04-22 01:34:44 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 481,296 tokens +(EngineCore pid=243) INFO 04-22 01:34:44 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 58.75x +(Worker_TP3 pid=445) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=444) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=442) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=443) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=445) 2026-04-22 01:34:44,946 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP1 pid=443) 2026-04-22 01:34:44,947 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP0 pid=442) 2026-04-22 01:34:44,947 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(Worker_TP2 pid=444) 2026-04-22 01:34:44,950 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP3 pid=445) DEBUG 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) 2026-04-22 01:34:45,053 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP0 pid=442) 2026-04-22 01:34:45,053 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP1 pid=443) 2026-04-22 01:34:45,053 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP2 pid=444) 2026-04-22 01:34:45,053 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP3 pid=445) DEBUG 04-22 01:34:45 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=243) INFO 04-22 01:34:57 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP1 pid=443) DEBUG 04-22 01:34:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=442) DEBUG 04-22 01:34:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=444) DEBUG 04-22 01:34:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=445) DEBUG 04-22 01:34:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 01:34:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 01:34:57 [v1/engine/core.py:1158] EngineCore waiting for work. +(EngineCore pid=243) DEBUG 04-22 01:34:57 [v1/engine/core.py:1158] EngineCore waiting for work. +(APIServer pid=1) INFO 04-22 01:34:57 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] +(Worker_TP3 pid=445) DEBUG 04-22 01:34:58 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=442) DEBUG 04-22 01:34:58 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=444) DEBUG 04-22 01:34:58 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=443) DEBUG 04-22 01:34:58 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) DEBUG 04-22 01:34:59 [renderers/base.py:197] Warming up chat template processing... +(APIServer pid=1) DEBUG 04-22 01:34:59 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. 
(Request ID: Root=1-69e825c3-262cbbf84c1596706b620aed;550b368f-0e76-44a4-a224-60910a22ce88) +(APIServer pid=1) DEBUG 04-22 01:34:59 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 01:34:59 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-Dev-72B/resolve/main/processor_config.json. +(APIServer pid=1) DEBUG 04-22 01:34:59 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e825c3-70eafc0c3e5c2e900043d0c7;f53b034b-cfa3-49f9-be82-3d60dba5c7b9) +(APIServer pid=1) DEBUG 04-22 01:34:59 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 01:34:59 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-Dev-72B/resolve/main/preprocessor_config.json. +(APIServer pid=1) INFO 04-22 01:35:01 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. 
+(APIServer pid=1) DEBUG 04-22 01:35:01 [renderers/base.py:203] Chat template warmup completed in 1.285s +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:37] Available routes are: +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /load, Methods: GET +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /version, Methods: GET +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /health, Methods: GET +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST +(APIServer pid=1) INFO 04-22 01:35:02 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST +(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST +(APIServer pid=1) INFO: Started server process [1] +(APIServer pid=1) INFO: Waiting for application startup. +(APIServer pid=1) INFO: Application startup complete. +(APIServer pid=1) DEBUG 04-22 01:35:09 [v1/engine/async_llm.py:875] Called check_health. +(APIServer pid=1) INFO: 10.131.2.2:45350 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.log new file mode 100644 index 00000000..ee8f5d9f --- /dev/null +++ b/accuracy/results/v0.19.0/logs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.log @@ -0,0 +1,1126 @@ +DEBUG 04-21 23:53:06 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
+DEBUG 04-21 23:53:06 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-21 23:53:06 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-21 23:53:06 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:53:06 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:53:06 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-21 23:53:06 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-21 23:53:06 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-21 23:53:06 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-21 23:53:06 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:53:06 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:53:06 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-21 23:53:11 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-21 23:53:13 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-21 23:53:13 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-21 23:53:13 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-21 23:53:13 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-21 23:53:13 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-21 23:53:13 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-21 23:53:13 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-21 23:53:13 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-21 23:53:13 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model moonshotai/Kimi-VL-A3B-Instruct +(APIServer pid=1) INFO 04-21 23:53:13 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-21 23:53:13 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-21 23:53:13 [entrypoints/utils.py:233] non-default args: {'model_tag': 'moonshotai/Kimi-VL-A3B-Instruct', 'model': 'moonshotai/Kimi-VL-A3B-Instruct', 'trust_remote_code': True, 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-21 23:53:13 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-21 23:53:14 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.kimi_vl.KimiVLForConditionalGeneration from cache +(APIServer pid=1) DEBUG 04-21 23:53:14 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0025763 secs +(APIServer pid=1) INFO 04-21 23:53:14 [config/model.py:549] Resolved architecture: KimiVLForConditionalGeneration +(APIServer pid=1) INFO 04-21 23:53:14 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-21 23:53:14 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-21 23:53:14 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-21 23:53:14 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-21 23:53:14 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-21 23:53:14 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-21 23:53:14 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-21 23:53:14 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-21 23:53:14 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-21 23:53:14 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) WARNING 04-21 23:53:14 [tokenizers/registry.py:212] Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead. +(APIServer pid=1) DEBUG 04-21 23:53:14 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-21 23:53:15 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. +(APIServer pid=1) DEBUG 04-21 23:53:15 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +(APIServer pid=1) DEBUG 04-21 23:53:15 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80deb-1ca2657603acf53a72029f01;435fb308-1aeb-440e-8511-57fefe40e375) +(APIServer pid=1) DEBUG 04-21 23:53:15 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-21 23:53:15 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. +(APIServer pid=1) DEBUG 04-21 23:53:16 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. 
(Request ID: Root=1-69e80dec-4e4380984c55a25e0c831aa6;ff27d06d-bd40-4e2d-b896-41d657dab11e) +(APIServer pid=1) DEBUG 04-21 23:53:16 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-21 23:53:16 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. +DEBUG 04-21 23:53:23 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-21 23:53:23 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-21 23:53:23 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-21 23:53:23 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:53:23 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:53:23 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-21 23:53:23 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-21 23:53:23 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-21 23:53:23 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-21 23:53:23 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:53:23 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:53:23 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-21 23:53:28 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=244) DEBUG 04-21 23:53:29 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-21 23:53:29 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=244) DEBUG 04-21 23:53:29 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/5f55a193-a5dc-4b3c-8486-b927e244d0cc'], outputs=['ipc:///tmp/2769faea-c662-424e-993f-d1ea27de3c9c'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=244) DEBUG 04-21 23:53:29 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=244) DEBUG 04-21 23:53:29 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=244) DEBUG 04-21 23:53:29 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=244) DEBUG 04-21 23:53:29 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=244) DEBUG 04-21 23:53:29 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=244) INFO 04-21 23:53:29 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='moonshotai/Kimi-VL-A3B-Instruct', speculative_config=None, tokenizer='moonshotai/Kimi-VL-A3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=moonshotai/Kimi-VL-A3B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=244) DEBUG 04-21 23:53:30 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(EngineCore pid=244) WARNING 04-21 23:53:30 [tokenizers/registry.py:212] Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead. +(EngineCore pid=244) DEBUG 04-21 23:53:31 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.19:46795 backend=nccl +(EngineCore pid=244) INFO 04-21 23:53:31 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.19:46795 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(EngineCore pid=244) DEBUG 04-21 23:53:31 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) INFO 04-21 23:53:31 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A +(EngineCore pid=244) DEBUG 04-21 23:53:31 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776815611.7651086, auto_measure=True +(EngineCore pid=244) DEBUG 04-21 23:53:31 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=244) DEBUG 04-21 23:53:31 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-21 23:53:31 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=244) DEBUG 04-21 23:53:31 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-21 23:53:31 [compilation/decorators.py:213] Inferred dynamic dimensions for forward 
method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=244) DEBUG 04-21 23:53:31 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(EngineCore pid=244) DEBUG 04-21 23:53:31 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=244) DEBUG 04-21 23:53:31 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. +(EngineCore pid=244) DEBUG 04-21 23:53:32 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80dfb-656344db4555b931477bec0a;0deda71c-ac0d-4025-a3eb-3d9396dea2c6) +(EngineCore pid=244) DEBUG 04-21 23:53:32 [transformers_utils/repo_utils.py:243] +(EngineCore pid=244) DEBUG 04-21 23:53:32 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. +(EngineCore pid=244) DEBUG 04-21 23:53:34 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80dfe-3feddb5218e61836715685f8;7c2e9747-00f7-4f1c-900e-19c1d1d4f167) +(EngineCore pid=244) DEBUG 04-21 23:53:34 [transformers_utils/repo_utils.py:243] +(EngineCore pid=244) DEBUG 04-21 23:53:34 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. +(EngineCore pid=244) DEBUG 04-21 23:53:38 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=244) INFO 04-21 23:53:38 [v1/worker/gpu_model_runner.py:4735] Starting to load model moonshotai/Kimi-VL-A3B-Instruct... 
+(EngineCore pid=244) INFO 04-21 23:53:38 [platforms/cuda.py:390] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention +(EngineCore pid=244) INFO 04-21 23:53:38 [model_executor/.../attention/mm_encoder_attention.py:230] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention. +(EngineCore pid=244) INFO 04-21 23:53:38 [config/vllm.py:790] Asynchronous scheduling is enabled. +(EngineCore pid=244) DEBUG 04-21 23:53:39 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=576, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=True, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASHINFER_MLA: [compute capability not supported], FLASHMLA_SPARSE: [non-sparse not supported]}. +(EngineCore pid=244) INFO 04-21 23:53:39 [platforms/cuda.py:334] Using FLASH_ATTN_MLA attention backend out of potential backends: ['FLASH_ATTN_MLA', 'FLASHMLA', 'TRITON_MLA']. +(EngineCore pid=244) INFO 04-21 23:53:39 [model_executor/.../attention/mla_attention.py:2137] Using FlashAttention prefill for MLA +(EngineCore pid=244) INFO 04-21 23:53:39 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. +(EngineCore pid=244) INFO 04-21 23:53:39 [model_executor/.../oracle/unquantized.py:186] Using TRITON backend for Unquantized MoE +(EngineCore pid=244) DEBUG 04-21 23:53:39 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts +(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=244) DEBUG 04-21 23:53:39 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=244) DEBUG 04-21 23:53:39 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) +(EngineCore pid=244) DEBUG 04-21 23:53:39 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 82, 'mla_decode_concat_quant_fp8': 27, 'silu_and_mul': 27, 'fused_moe': 26, 'unquantized_fused_moe': 26, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=244) DEBUG 04-21 23:53:39 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) +(EngineCore pid=244) DEBUG 04-21 23:53:39 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 82, 'mla_decode_concat_quant_fp8': 27, 'silu_and_mul': 27, 'fused_moe': 26, 'unquantized_fused_moe': 26, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=244) DEBUG 04-21 23:53:39 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(EngineCore pid=244) DEBUG 04-21 23:53:39 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00006-of-00007.safetensors', 'model-00007-of-00007.safetensors', 'model-00004-of-00007.safetensors', 'model-00002-of-00007.safetensors', 'model-00001-of-00007.safetensors', 'model-00005-of-00007.safetensors', 'model-00003-of-00007.safetensors']] +(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00 +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/mla_attention.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/mla.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v2.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=244) INFO 04-21 23:54:14 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/b96ff633a9/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=632e15ae31 comp=e546579c48 code=9292e945a1cdc87ebcb24cd65543d194ea60732523e9e0a6221751ff295bdfed dir=/data/.cache/vllm/torch_compile_cache/b96ff633a9/rank_0_0/backbone +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': 
None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, 
+(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=244) 
DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', 
+(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 
'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 
'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': 
False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 
[compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore 
pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 
'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=244) DEBUG 04-21 
23:54:14 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] Vllm config hash: 632e15ae31 +(EngineCore pid=244) INFO 04-21 23:54:15 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.34 s +(EngineCore pid=244) DEBUG 04-21 23:54:15 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=244) DEBUG 04-21 23:54:15 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=244) DEBUG 04-21 23:54:15 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-21 23:54:15 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=244) DEBUG 04-21 23:54:15 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(EngineCore pid=244) DEBUG 04-21 23:54:15 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-21 23:54:15 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=244) INFO 04-21 23:54:18 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=244) DEBUG 04-21 23:54:18 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/b96ff633a9/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=244) DEBUG 04-21 23:54:18 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-21 23:54:18 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=244) DEBUG 04-21 23:54:18 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(EngineCore pid=244) DEBUG 04-21 23:54:18 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-21 23:54:18 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(APIServer pid=1) DEBUG 04-21 23:54:19 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=244) DEBUG 04-21 23:54:20 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/b96ff633a9/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=244) DEBUG 04-21 23:54:20 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-21 23:54:20 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=244) DEBUG 04-21 23:54:20 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(EngineCore pid=244) DEBUG 04-21 23:54:20 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-21 23:54:20 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=244) DEBUG 04-21 23:54:22 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_2', 
'/data/.cache/vllm/torch_compile_cache/b96ff633a9/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_2') +(EngineCore pid=244) DEBUG 04-21 23:54:23 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-21 23:54:23 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=244) DEBUG 04-21 23:54:23 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(EngineCore pid=244) DEBUG 04-21 23:54:23 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-21 23:54:23 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=244) DEBUG 04-21 23:54:23 [compilation/backends.py:377] Store the 27-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_27', '/data/.cache/vllm/torch_compile_cache/b96ff633a9/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_27') +(EngineCore pid=244) INFO 04-21 23:54:23 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 8.30 s +(EngineCore pid=244) DEBUG 04-21 23:54:23 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/b96ff633a9/rank_0_0/backbone/computation_graph.py +(EngineCore pid=244) INFO 04-21 23:54:24 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/27aaa92cd5a8b1d618310af6363b4ccd68035f614eff82150e0efa0993b9fb9d/rank_0_0/model +(EngineCore pid=244) INFO 04-21 23:54:24 [compilation/monitor.py:48] torch.compile took 13.76 s in total +(EngineCore pid=244) WARNING 04-21 23:54:25 [model_executor/.../fused_moe/fused_moe.py:1090] Using default MoE config. Performance might be sub-optimal! 
Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_H100_80GB_HBM3.json +(EngineCore pid=244) INFO 04-21 23:54:26 [compilation/monitor.py:76] Initial profiling/warmup run took 1.94 s +(APIServer pid=1) DEBUG 04-21 23:54:29 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=244) INFO 04-21 23:54:31 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=244) DEBUG 04-21 23:54:31 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=244) INFO 04-21 23:54:31 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-21 23:54:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-21 23:54:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 106.00 MiB first-capture + (51-1) × 8.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-21 23:54:32 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-21 23:54:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-21 23:54:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 522.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-21 23:54:33 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=244) INFO 04-21 23:54:33 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.19 GiB total +(EngineCore pid=244) DEBUG 04-21 23:54:33 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=244) DEBUG 04-21 23:54:33 [v1/worker/gpu_worker.py:430] Free memory after profiling: 46.83 GiB (total), 43.38 GiB (within requested) +(EngineCore pid=244) DEBUG 04-21 23:54:33 [v1/worker/gpu_worker.py:435] Memory profiling takes 23.31 seconds. Total non KV cache memory: 33.91GiB; torch peak memory increase: 2.92GiB; non-torch forward increase memory: 0.25GiB; weights memory: 30.74GiB. 
+(EngineCore pid=244) INFO 04-21 23:54:33 [v1/worker/gpu_worker.py:436] Available KV cache memory: 41.32 GiB +(EngineCore pid=244) INFO 04-21 23:54:33 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9651 to maintain the same effective KV cache size. +(EngineCore pid=244) INFO 04-21 23:54:33 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,426,368 tokens +(EngineCore pid=244) INFO 04-21 23:54:33 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 174.12x +(EngineCore pid=244) 2026-04-21 23:54:33,586 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=244) DEBUG 04-21 23:54:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) 2026-04-21 23:54:33,615 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-21 23:57:07 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-21 23:57:07 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-21 23:57:07 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-21 23:57:07 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-21 23:57:07 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-21 23:57:07 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model moonshotai/Kimi-VL-A3B-Instruct +(APIServer pid=1) INFO 04-21 23:57:07 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-21 23:57:07 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-21 23:57:07 [entrypoints/utils.py:233] non-default args: {'model_tag': 'moonshotai/Kimi-VL-A3B-Instruct', 'model': 'moonshotai/Kimi-VL-A3B-Instruct', 'trust_remote_code': True, 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-21 23:57:07 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-21 23:57:08 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.kimi_vl.KimiVLForConditionalGeneration from cache +(APIServer pid=1) DEBUG 04-21 23:57:08 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0034862 secs +(APIServer pid=1) INFO 04-21 23:57:08 [config/model.py:549] Resolved architecture: KimiVLForConditionalGeneration +(APIServer pid=1) INFO 04-21 23:57:08 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-21 23:57:08 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-21 23:57:08 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-21 23:57:08 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-21 23:57:08 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-21 23:57:08 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-21 23:57:08 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-21 23:57:08 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-21 23:57:08 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) DEBUG 04-21 23:57:08 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. +(APIServer pid=1) INFO 04-21 23:57:08 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-21 23:57:09 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-21 23:57:09 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) WARNING 04-21 23:57:09 [tokenizers/registry.py:212] Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead. +(APIServer pid=1) DEBUG 04-21 23:57:09 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-21 23:57:09 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. 
+(APIServer pid=1) DEBUG 04-21 23:57:09 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +(APIServer pid=1) DEBUG 04-21 23:57:10 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80ed6-087dc9aa3871621f3872a3a4;1e736337-4cc0-4549-a520-1c575d029903) +(APIServer pid=1) DEBUG 04-21 23:57:10 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-21 23:57:10 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. +(APIServer pid=1) DEBUG 04-21 23:57:11 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80ed7-0f06928f1da9c20c258d924c;e4bb89c5-ea75-4fdc-87a3-765fff118610) +(APIServer pid=1) DEBUG 04-21 23:57:11 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-21 23:57:11 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. +DEBUG 04-21 23:57:18 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-21 23:57:18 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-21 23:57:18 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-21 23:57:18 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:57:18 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:57:18 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-21 23:57:18 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-21 23:57:18 [platforms/__init__.py:134] Checking if XPU platform is available. 
+DEBUG 04-21 23:57:18 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-21 23:57:18 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:57:18 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:57:18 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-21 23:57:23 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(APIServer pid=1) DEBUG 04-21 23:57:24 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. +(EngineCore pid=245) DEBUG 04-21 23:57:24 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-21 23:57:24 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=245) DEBUG 04-21 23:57:24 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/53a8b05b-abcf-4e7e-aee7-b70e26a95d27'], outputs=['ipc:///tmp/8179865b-8008-423f-9410-4cde8ed0a4ec'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=245) DEBUG 04-21 23:57:24 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=245) DEBUG 04-21 23:57:24 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=245) DEBUG 04-21 23:57:24 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=245) DEBUG 04-21 23:57:24 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=245) DEBUG 04-21 23:57:24 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=245) INFO 04-21 23:57:24 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='moonshotai/Kimi-VL-A3B-Instruct', speculative_config=None, tokenizer='moonshotai/Kimi-VL-A3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=moonshotai/Kimi-VL-A3B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=245) WARNING 04-21 23:57:24 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=245) INFO 04-21 23:57:24 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.128.4.181 (local), world_size=2, local_world_size=2 +(EngineCore pid=245) DEBUG 04-21 23:57:24 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/209d9e2f-5438-4f2f-868c-5e831de3572e +(EngineCore pid=245) DEBUG 04-21 23:57:24 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_9b94b597'), local_subscribe_addr='ipc:///tmp/209d9e2f-5438-4f2f-868c-5e831de3572e', local_notify_addr='ipc:///tmp/e98e7316-2b82-4fa7-8b00-5dc7a6284217', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-21 23:57:28 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-21 23:57:28 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-21 23:57:28 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-21 23:57:28 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-21 23:57:28 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-21 23:57:28 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-21 23:57:28 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:57:28 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:57:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:57:28 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-21 23:57:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
+DEBUG 04-21 23:57:28 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-21 23:57:28 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-21 23:57:28 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-21 23:57:28 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:57:28 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-21 23:57:28 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-21 23:57:28 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-21 23:57:28 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-21 23:57:28 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-21 23:57:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:57:28 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-21 23:57:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-21 23:57:28 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-21 23:57:33 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-21 23:57:33 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-21 23:57:34 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-21 23:57:34 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-21 23:57:34 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-21 23:57:34 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-21 23:57:34 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-21 23:57:34 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-21 23:57:34 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-21 23:57:34 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) DEBUG 04-21 23:57:34 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +DEBUG 04-21 23:57:35 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +DEBUG 04-21 23:57:35 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +WARNING 04-21 23:57:35 [tokenizers/registry.py:212] Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead. +(Worker pid=444) DEBUG 04-21 23:57:35 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:55183 backend=nccl +(Worker pid=444) INFO 04-21 23:57:35 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:55183 backend=nccl +WARNING 04-21 23:57:36 [tokenizers/registry.py:212] Using a slow tokenizer. 
This might cause a significant slowdown. Consider using a fast tokenizer instead. +(Worker pid=445) DEBUG 04-21 23:57:36 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:55183 backend=nccl +(Worker pid=445) INFO 04-21 23:57:36 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:55183 backend=nccl +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=445) DEBUG 04-21 23:57:36 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=444) DEBUG 04-21 23:57:36 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(Worker pid=444) DEBUG 04-21 23:57:36 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=444) INFO 04-21 23:57:36 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=445) DEBUG 04-21 23:57:37 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=444) DEBUG 04-21 23:57:37 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=444) DEBUG 04-21 23:57:37 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/4634f212-25f1-4a79-bbe7-2d22deb8d514 +(Worker pid=444) DEBUG 04-21 23:57:37 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_eba5827f'), local_subscribe_addr='ipc:///tmp/4634f212-25f1-4a79-bbe7-2d22deb8d514', local_notify_addr='ipc:///tmp/7a2e1266-15b1-4565-b1ec-33a83d7f84c1', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=445) DEBUG 04-21 23:57:37 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/4634f212-25f1-4a79-bbe7-2d22deb8d514 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=444) INFO 04-21 23:57:37 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A +(Worker pid=445) DEBUG 04-21 23:57:38 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.18GiB, total_memory=79.19GiB, cuda_memory=2.01GiB, torch_memory=0.02GiB, non_torch_memory=1.99GiB, timestamp=1776815858.1115682, auto_measure=True +(Worker pid=445) DEBUG 04-21 23:57:38 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=444) DEBUG 04-21 23:57:38 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.18GiB, total_memory=79.19GiB, cuda_memory=2.01GiB, torch_memory=0.02GiB, non_torch_memory=1.99GiB, timestamp=1776815858.153387, auto_measure=True +(Worker pid=444) DEBUG 04-21 23:57:38 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=445) DEBUG 04-21 23:57:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=445) DEBUG 04-21 23:57:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=445) DEBUG 04-21 23:57:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=445) DEBUG 04-21 23:57:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=444) DEBUG 04-21 23:57:38 [compilation/decorators.py:213] 
Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-21 23:57:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=444) DEBUG 04-21 23:57:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=445) DEBUG 04-21 23:57:38 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=444) DEBUG 04-21 23:57:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=444) DEBUG 04-21 23:57:38 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=444) DEBUG 04-21 23:57:38 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=445) DEBUG 04-21 23:57:38 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80ef2-2dea93487f2b57286bbd2791;5057faf0-b328-471e-acd5-73c6b4cd8a8b) +(Worker pid=445) DEBUG 04-21 23:57:38 [transformers_utils/repo_utils.py:243] +(Worker pid=445) DEBUG 04-21 23:57:38 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. +(Worker pid=444) DEBUG 04-21 23:57:38 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. 
(Request ID: Root=1-69e80ef2-1a640bbc17e245a301bebbd5;373adc4d-7871-4c01-bc7f-8ef75479df0c) +(Worker pid=444) DEBUG 04-21 23:57:38 [transformers_utils/repo_utils.py:243] +(Worker pid=444) DEBUG 04-21 23:57:38 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. +(Worker pid=445) DEBUG 04-21 23:57:41 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80ef5-30f4eb86103918785d3573ed;74e46153-3902-467b-ac2e-3792e551f1e7) +(Worker pid=445) DEBUG 04-21 23:57:41 [transformers_utils/repo_utils.py:243] +(Worker pid=445) DEBUG 04-21 23:57:41 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. +(Worker pid=444) DEBUG 04-21 23:57:41 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80ef5-19441e254ef6204f13830997;3069dd3a-dab8-4503-98dc-c508429e3cd6) +(Worker pid=444) DEBUG 04-21 23:57:41 [transformers_utils/repo_utils.py:243] +(Worker pid=444) DEBUG 04-21 23:57:41 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. +(Worker_TP1 pid=445) DEBUG 04-21 23:57:44 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. +(Worker pid=444) DEBUG 04-21 23:57:44 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_TP0 pid=444) INFO 04-21 23:57:44 [v1/worker/gpu_model_runner.py:4735] Starting to load model moonshotai/Kimi-VL-A3B-Instruct... 
+(Worker_TP1 pid=445) DEBUG 04-21 23:57:44 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP1 pid=445) DEBUG 04-21 23:57:44 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) +(Worker_TP1 pid=445) DEBUG 04-21 23:57:44 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 82, 'mla_decode_concat_quant_fp8': 27, 'silu_and_mul': 27, 'fused_moe': 26, 'unquantized_fused_moe': 26, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=445) DEBUG 04-21 23:57:44 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) +(Worker_TP1 pid=445) DEBUG 04-21 23:57:44 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 82, 'mla_decode_concat_quant_fp8': 27, 'silu_and_mul': 27, 'fused_moe': 26, 'unquantized_fused_moe': 26, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=445) DEBUG 04-21 23:57:44 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP0 pid=444) INFO 04-21 23:57:44 [platforms/cuda.py:390] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention +(Worker_TP0 pid=444) INFO 04-21 23:57:44 [model_executor/.../attention/mm_encoder_attention.py:230] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention. +(Worker_TP0 pid=444) INFO 04-21 23:57:44 [config/vllm.py:790] Asynchronous scheduling is enabled. +(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. 
+(Worker_TP0 pid=444) INFO 04-21 23:57:44 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP1 pid=445) DEBUG 04-21 23:57:44 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00007-of-00007.safetensors', 'model-00002-of-00007.safetensors', 'model-00006-of-00007.safetensors', 'model-00001-of-00007.safetensors', 'model-00004-of-00007.safetensors', 'model-00003-of-00007.safetensors', 'model-00005-of-00007.safetensors']] +(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=576, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=True, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASHINFER_MLA: [compute capability not supported], FLASHMLA_SPARSE: [non-sparse not supported]}. +(Worker_TP0 pid=444) INFO 04-21 23:57:44 [platforms/cuda.py:334] Using FLASH_ATTN_MLA attention backend out of potential backends: ['FLASH_ATTN_MLA', 'FLASHMLA', 'TRITON_MLA']. +(Worker_TP0 pid=444) INFO 04-21 23:57:44 [model_executor/.../attention/mla_attention.py:2137] Using FlashAttention prefill for MLA +(Worker_TP0 pid=444) INFO 04-21 23:57:44 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. +(Worker_TP0 pid=444) INFO 04-21 23:57:44 [model_executor/.../oracle/unquantized.py:186] Using TRITON backend for Unquantized MoE +(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts +(APIServer pid=1) DEBUG 04-21 23:57:44 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) +(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 82, 'mla_decode_concat_quant_fp8': 27, 'silu_and_mul': 27, 'fused_moe': 26, 'unquantized_fused_moe': 26, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) +(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 82, 'mla_decode_concat_quant_fp8': 27, 'silu_and_mul': 27, 'fused_moe': 26, 'unquantized_fused_moe': 26, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP0 pid=444) DEBUG 04-21 23:57:45 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00003-of-00007.safetensors', 'model-00001-of-00007.safetensors', 'model-00007-of-00007.safetensors', 'model-00004-of-00007.safetensors', 'model-00006-of-00007.safetensors', 'model-00002-of-00007.safetensors', 'model-00005-of-00007.safetensors']] +(Worker_TP0 pid=444) Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00 +(Worker_TP1 pid=445) DEBUG 04-21 23:58:13 [compilation/decorators.py:528] Start compiling function +(EngineCore pid=245) DEBUG 04-21 23:58:13 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) DEBUG 04-21 23:58:14 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP1 
pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/mla_attention.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/mla.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v2.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/mla_attention.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/mla.py +(Worker_TP0 pid=444) DEBUG 
04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v2.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=d8fa51fca8 comp=e546579c48 code=a8ee4e07d8cfb16ff10c7a91bbf306af44d6b5cf8dda22a08a2b5e657ca5d21c dir=/data/.cache/vllm/torch_compile_cache/68d54dc2af/rank_1_0/backbone +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-21 
23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, 
+(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 
'relax', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, 
+(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 
[compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP1 pid=445) DEBUG 04-21 
23:58:17 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 
'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP1 pid=445) 
DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 
'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP1 pid=445) DEBUG 
04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] Vllm config hash: d8fa51fca8 +(Worker_TP0 pid=444) INFO 04-21 23:58:17 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/68d54dc2af/rank_0_0/backbone for vLLM's torch.compile +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=d8fa51fca8 comp=e546579c48 code=a8ee4e07d8cfb16ff10c7a91bbf306af44d6b5cf8dda22a08a2b5e657ca5d21c dir=/data/.cache/vllm/torch_compile_cache/68d54dc2af/rank_0_0/backbone +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP0 pid=444) DEBUG 
04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 
'VLLM_API_KEY': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 
'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 
'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP0 pid=444) DEBUG 
04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP0 pid=444) 
DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 
[compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 
[compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, 
+(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] Vllm config hash: d8fa51fca8 +(Worker_TP0 pid=444) INFO 04-21 23:58:17 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.28 s +(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 8192 +(Worker_TP0 pid=444) INFO 04-21 23:58:17 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm +(Worker_TP0 pid=444) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. 
+(Worker_TP0 pid=444) return func(*args, **kwargs) +(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=8192, hidden_dim=2048, dtype=torch.bfloat16 +(Worker_TP1 pid=445) DEBUG 04-21 23:58:18 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=8192, hidden_dim=2048, dtype=torch.bfloat16 +(Worker_TP0 pid=444) INFO 04-21 23:58:18 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm +(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 0 patterns +(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 1.7 ms +(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP1 pid=445) DEBUG 04-21 23:58:18 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and 
slices +(Worker_TP1 pid=445) DEBUG 04-21 23:58:18 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP1 pid=445) DEBUG 04-21 23:58:18 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 0 patterns +(Worker_TP1 pid=445) DEBUG 04-21 23:58:18 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 1.6 ms +(Worker_TP1 pid=445) DEBUG 04-21 23:58:18 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-21 23:58:18 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=445) DEBUG 04-21 23:58:18 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP0 pid=444) INFO 04-21 23:58:21 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(Worker_TP0 pid=444) DEBUG 04-21 23:58:21 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/68d54dc2af/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(Worker_TP0 pid=444) DEBUG 04-21 23:58:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-21 23:58:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP1 pid=445) DEBUG 04-21 23:58:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-21 23:58:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(Worker_TP0 pid=444) DEBUG 04-21 23:58:21 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=444) DEBUG 04-21 23:58:21 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.4 ms +(Worker_TP0 
pid=444) DEBUG 04-21 23:58:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-21 23:58:21 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=444) DEBUG 04-21 23:58:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-21 23:58:21 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=445) DEBUG 04-21 23:58:21 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.9 ms +(Worker_TP1 pid=445) DEBUG 04-21 23:58:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-21 23:58:21 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=445) DEBUG 04-21 23:58:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-21 23:58:22 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/68d54dc2af/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(Worker_TP0 pid=444) DEBUG 04-21 23:58:23 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-21 23:58:23 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP1 pid=445) DEBUG 04-21 23:58:23 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-21 23:58:23 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP0 pid=444) DEBUG 04-21 23:58:23 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=444) 
DEBUG 04-21 23:58:23 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.0 ms +(Worker_TP0 pid=444) DEBUG 04-21 23:58:23 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-21 23:58:23 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=444) DEBUG 04-21 23:58:23 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-21 23:58:23 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=445) DEBUG 04-21 23:58:23 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.5 ms +(Worker_TP1 pid=445) DEBUG 04-21 23:58:23 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-21 23:58:23 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=445) DEBUG 04-21 23:58:23 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-21 23:58:23 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/68d54dc2af/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_2') +(Worker_TP0 pid=444) DEBUG 04-21 23:58:24 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-21 23:58:24 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP1 pid=445) DEBUG 04-21 23:58:24 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-21 23:58:24 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP0 
pid=444) DEBUG 04-21 23:58:24 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=444) DEBUG 04-21 23:58:24 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 35.5 ms +(Worker_TP0 pid=444) DEBUG 04-21 23:58:24 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP0 pid=444) DEBUG 04-21 23:58:24 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP0 pid=444) DEBUG 04-21 23:58:24 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-21 23:58:24 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=445) DEBUG 04-21 23:58:24 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.2 ms +(Worker_TP1 pid=445) DEBUG 04-21 23:58:24 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP1 pid=445) DEBUG 04-21 23:58:24 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP1 pid=445) DEBUG 04-21 23:58:24 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-21 23:58:24 [compilation/backends.py:377] Store the 27-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_27', '/data/.cache/vllm/torch_compile_cache/68d54dc2af/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_27') +(Worker_TP0 pid=444) INFO 04-21 23:58:24 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.48 s +(APIServer pid=1) DEBUG 04-21 23:58:24 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP0 pid=444) DEBUG 04-21 23:58:24 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/68d54dc2af/rank_0_0/backbone/computation_graph.py +(Worker_TP0 pid=444) INFO 04-21 23:58:25 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/b88d4c6cebccb530609b324c5e6df85458bab759d77baaef56518e181c625670/rank_0_0/model +(Worker_TP0 pid=444) INFO 04-21 23:58:25 [compilation/monitor.py:48] torch.compile took 12.23 s in total +(Worker_TP0 pid=444) WARNING 04-21 23:58:27 [model_executor/.../fused_moe/fused_moe.py:1090] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=704,device_name=NVIDIA_H100_80GB_HBM3.json +(Worker_TP0 pid=444) INFO 04-21 23:58:27 [compilation/monitor.py:76] Initial profiling/warmup run took 2.16 s +(Worker_TP1 pid=445) INFO 04-21 23:58:33 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP1 pid=445) DEBUG 04-21 23:58:33 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP1 pid=445) INFO 04-21 23:58:33 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP0 pid=444) INFO 04-21 23:58:33 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP0 pid=444) DEBUG 04-21 23:58:33 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP0 pid=444) INFO 04-21 23:58:33 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP0 pid=444) DEBUG 04-21 23:58:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-21 23:58:33 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-21 23:58:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-21 23:58:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-21 23:58:33 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-21 23:58:33 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 106.00 MiB first-capture + (51-1) × 8.00 MiB per-graph +(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None 
+(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 106.00 MiB first-capture + (51-1) × 8.00 MiB per-graph +(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 
264.00 MiB first-capture + (51-1) × 8.00 MiB per-graph +(Worker_TP1 pid=445) INFO 04-21 23:58:34 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 264.00 MiB first-capture + (51-1) × 8.00 MiB per-graph +(Worker_TP0 pid=444) INFO 04-21 23:58:34 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(APIServer pid=1) DEBUG 04-21 23:58:34 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP1 pid=445) INFO 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.04 GiB total +(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP0 pid=444) INFO 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.04 GiB total +(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [v1/worker/gpu_worker.py:424] Initial free memory: 77.18 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [v1/worker/gpu_worker.py:430] Free memory after profiling: 58.93 GiB (total), 56.97 GiB (within requested) +(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [v1/worker/gpu_worker.py:435] Memory profiling takes 22.13 seconds. Total non KV cache memory: 20.45GiB; torch peak memory increase: 2.85GiB; non-torch forward increase memory: 2.07GiB; weights memory: 15.53GiB. +(Worker_TP1 pid=445) INFO 04-21 23:58:35 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. 
To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9631 to maintain the same effective KV cache size. +(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=444) DEBUG 04-21 23:58:35 [v1/worker/gpu_worker.py:424] Initial free memory: 77.18 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP0 pid=444) DEBUG 04-21 23:58:35 [v1/worker/gpu_worker.py:430] Free memory after profiling: 58.93 GiB (total), 56.97 GiB (within requested) +(Worker_TP0 pid=444) DEBUG 04-21 23:58:35 [v1/worker/gpu_worker.py:435] Memory profiling takes 22.13 seconds. Total non KV cache memory: 20.45GiB; torch peak memory increase: 2.85GiB; non-torch forward increase memory: 2.07GiB; weights memory: 15.53GiB. +(Worker_TP0 pid=444) INFO 04-21 23:58:35 [v1/worker/gpu_worker.py:436] Available KV cache memory: 54.78 GiB +(Worker_TP0 pid=444) INFO 04-21 23:58:35 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9631 to maintain the same effective KV cache size. 
+(Worker_TP0 pid=444) DEBUG 04-21 23:58:35 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-21 23:58:35 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) INFO 04-21 23:58:35 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,890,896 tokens +(EngineCore pid=245) INFO 04-21 23:58:35 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 230.82x +(Worker_TP0 pid=444) DEBUG 04-21 23:58:35 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=445) 2026-04-21 23:58:35,346 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP0 pid=444) 2026-04-21 23:58:35,346 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-21 23:58:35 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) 2026-04-21 23:58:35,380 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP1 pid=445) 2026-04-21 23:58:35,380 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=245) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=245) DEBUG 04-21 23:58:53 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. 
+(EngineCore pid=245) INFO 04-21 23:58:53 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP0 pid=444) DEBUG 04-21 23:58:53 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=445) DEBUG 04-21 23:58:53 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-21 23:58:53 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-21 23:58:53 [v1/engine/core.py:1158] EngineCore waiting for work. +(EngineCore pid=245) DEBUG 04-21 23:58:53 [v1/engine/core.py:1158] EngineCore waiting for work. +(APIServer pid=1) INFO 04-21 23:58:53 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] +(APIServer pid=1) WARNING 04-21 23:58:53 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 0.2}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. +(APIServer pid=1) DEBUG 04-21 23:58:54 [renderers/base.py:197] Warming up chat template processing... +(APIServer pid=1) DEBUG 04-21 23:58:54 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80f3e-31013c70468c9b6a657b0454;72e067e6-2118-4306-9a44-d0fff2b7b1f5) +(APIServer pid=1) DEBUG 04-21 23:58:54 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-21 23:58:54 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. 
+(Worker_TP1 pid=445) DEBUG 04-21 23:58:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=444) DEBUG 04-21 23:58:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) INFO 04-21 23:58:55 [renderers/hf.py:314] Detected the chat template content format to be 'openai'. You can set `--chat-template-content-format` to override this. +(APIServer pid=1) DEBUG 04-21 23:58:55 [renderers/base.py:203] Chat template warmup completed in 1.683s +(APIServer pid=1) DEBUG 04-21 23:58:55 [renderers/base.py:218] Warming up multi-modal processing... +(APIServer pid=1) DEBUG 04-21 23:58:55 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80f3f-79c7822c760ebf2346048bfc;6a44fe89-171a-4323-ae01-fded739ec56a) +(APIServer pid=1) DEBUG 04-21 23:58:55 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-21 23:58:55 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. 
+(APIServer pid=1) INFO 04-21 23:58:58 [renderers/base.py:231] Multi-modal warmup completed in 2.691s +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:37] Available routes are: +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: HEAD, GET +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /docs, Methods: HEAD, GET +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: HEAD, GET +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /redoc, Methods: HEAD, GET +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /load, Methods: GET +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /version, Methods: GET +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /health, Methods: GET +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST +(APIServer pid=1) INFO 04-21 23:58:58 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST +(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST +(APIServer pid=1) INFO: Started server process [1] +(APIServer pid=1) INFO: Waiting for application startup. +(APIServer pid=1) INFO: Application startup complete. +(APIServer pid=1) DEBUG 04-21 23:59:03 [v1/engine/async_llm.py:875] Called check_health. +(APIServer pid=1) INFO: 10.128.4.2:57898 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/qwen-qwen2---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.log b/accuracy/results/v0.19.0/logs/qwen-qwen2---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.log new file mode 100644 index 00000000..2f58cb26 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/qwen-qwen2---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.log @@ -0,0 +1,750 @@ +DEBUG 04-22 00:25:50 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
+DEBUG 04-22 00:25:50 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:25:50 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:25:50 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:25:50 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:25:50 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:25:50 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:25:50 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:25:50 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:25:50 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:25:50 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:25:50 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:25:55 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:25:57 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 00:25:57 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:25:57 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:25:57 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:25:57 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:25:57 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:25:57 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:25:57 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:25:57 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct +(APIServer pid=1) INFO 04-22 00:25:57 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:25:57 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:25:57 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'dtype': 'bfloat16', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'kv_cache_dtype': 'fp8', 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:25:57 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:25:57 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:25:57 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0014693 secs +(APIServer pid=1) INFO 04-22 00:25:57 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM +(APIServer pid=1) INFO 04-22 00:25:57 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:25:57 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:25:57 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:25:57 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:25:57 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 00:25:57 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 00:25:57 [config/cache.py:227] Using fp8 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. Meanwhile, it may cause accuracy drop without a proper scaling factor. +(APIServer pid=1) INFO 04-22 00:25:57 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:25:57 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:25:57 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:25:57 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:25:58 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:25:58 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:26:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:26:01 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:26:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:26:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:26:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:26:01 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:26:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:26:01 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:26:01 [platforms/__init__.py:165] Checking if CPU platform is available. 
+DEBUG 04-22 00:26:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:26:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:26:01 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:26:06 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=243) DEBUG 04-22 00:26:08 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:26:08 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=243) DEBUG 04-22 00:26:08 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/91f9a72c-0844-4dc5-aadc-a9cca228e026'], outputs=['ipc:///tmp/45be3ca9-00ed-4f3d-90fc-6f4191051c55'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 00:26:08 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 00:26:08 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 00:26:08 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 00:26:08 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 00:26:08 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=243) INFO 04-22 00:26:08 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=fp8, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) DEBUG 04-22 00:26:08 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.27:47777 backend=nccl +(EngineCore pid=243) INFO 04-22 00:26:08 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.27:47777 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) DEBUG 04-22 00:26:08 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) INFO 04-22 00:26:08 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=243) DEBUG 04-22 00:26:09 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776817569.088757, auto_measure=True +(EngineCore pid=243) DEBUG 04-22 00:26:09 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=243) DEBUG 04-22 00:26:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:26:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:26:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:26:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=243) DEBUG 04-22 00:26:09 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=243) DEBUG 04-22 00:26:09 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=243) DEBUG 04-22 00:26:09 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=243) INFO 04-22 00:26:09 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... +(EngineCore pid=243) DEBUG 04-22 00:26:09 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=fp8, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLEX_ATTENTION: [kv_cache_dtype not supported]}. +(EngineCore pid=243) INFO 04-22 00:26:09 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN']. +(EngineCore pid=243) INFO 04-22 00:26:09 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=243) INFO 04-22 00:26:09 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. +(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=243) DEBUG 04-22 00:26:10 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=243) DEBUG 04-22 00:26:10 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=243) DEBUG 04-22 00:26:10 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'quant_fp8': 28, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=243) DEBUG 04-22 00:26:10 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=243) DEBUG 04-22 00:26:10 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors']] +(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(APIServer pid=1) DEBUG 04-22 00:26:28 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=243) INFO 04-22 00:26:28 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/b8336a0fef/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=74c824397d comp=e546579c48 code=0a3e1ce528ff50bb8acbe695e2afa6812d0aec87dbdaec559728313523b0114f dir=/data/.cache/vllm/torch_compile_cache/b8336a0fef/rank_0_0/backbone +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] Compile env factors 
(raw): +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, 
+(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore 
pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=243) DEBUG 04-22 
00:26:28 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 
[compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=243) DEBUG 04-22 00:26:28 
[compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 
[compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=243) DEBUG 04-22 
00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, 
+(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore 
pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 
[compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] Vllm config hash: 74c824397d +(EngineCore pid=243) INFO 04-22 00:26:28 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.20 s +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=243) DEBUG 04-22 00:26:29 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices 
+(EngineCore pid=243) DEBUG 04-22 00:26:29 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 00:26:29 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=243) DEBUG 04-22 00:26:29 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:26:29 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) INFO 04-22 00:26:31 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=243) DEBUG 04-22 00:26:31 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/b8336a0fef/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=243) DEBUG 04-22 00:26:31 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:26:31 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=243) DEBUG 04-22 00:26:31 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.4 ms +(EngineCore pid=243) DEBUG 04-22 00:26:31 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:26:31 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) DEBUG 04-22 00:26:33 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/b8336a0fef/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=243) DEBUG 04-22 00:26:33 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:26:33 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(EngineCore pid=243) DEBUG 04-22 00:26:33 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=243) DEBUG 04-22 00:26:33 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:26:33 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 00:26:34 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/b8336a0fef/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') +(EngineCore pid=243) INFO 04-22 00:26:34 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.43 s +(EngineCore pid=243) DEBUG 04-22 00:26:34 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/b8336a0fef/rank_0_0/backbone/computation_graph.py +(EngineCore pid=243) INFO 04-22 00:26:35 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/867349b15c2e587a1763e2f6fedbb0931bd8753bb77f9ec7ed1cb742e42e0af6/rank_0_0/model +(EngineCore pid=243) INFO 04-22 00:26:35 [compilation/monitor.py:48] torch.compile took 11.06 s in total +(EngineCore pid=243) INFO 04-22 00:26:35 [compilation/monitor.py:76] Initial profiling/warmup run took 0.43 s +(APIServer pid=1) DEBUG 04-22 00:26:38 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) INFO 04-22 00:26:41 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=243) INFO 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:26:41 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:26:41 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 160.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:26:41 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
+(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:26:41 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 228.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 00:26:42 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=243) INFO 04-22 00:26:42 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.61 GiB total +(EngineCore pid=243) DEBUG 04-22 00:26:42 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=243) DEBUG 04-22 00:26:42 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.98 GiB (total), 59.53 GiB (within requested) +(EngineCore pid=243) DEBUG 04-22 00:26:42 [v1/worker/gpu_worker.py:435] Memory profiling takes 18.20 seconds. Total non KV cache memory: 16.7GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.24GiB; weights memory: 14.25GiB. +(EngineCore pid=243) INFO 04-22 00:26:42 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.53 GiB +(EngineCore pid=243) INFO 04-22 00:26:42 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9577 to maintain the same effective KV cache size. 
+(EngineCore pid=243) INFO 04-22 00:26:42 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 2,192,000 tokens +(EngineCore pid=243) INFO 04-22 00:26:42 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 267.58x +(EngineCore pid=243) 2026-04-22 00:26:42,679 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=243) DEBUG 04-22 00:26:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) 2026-04-22 00:26:42,690 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:01:19 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:01:19 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:01:19 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:01:19 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:01:19 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:01:19 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-72B-Instruct +(APIServer pid=1) INFO 04-22 01:01:19 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:01:19 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:01:19 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-72B-Instruct', 'model': 'Qwen/Qwen2.5-72B-Instruct', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:01:19 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:01:20 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:01:20 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003588 secs +(APIServer pid=1) INFO 04-22 01:01:20 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM +(APIServer pid=1) INFO 04-22 01:01:20 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 01:01:20 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:01:20 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:01:20 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:01:20 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:01:20 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 01:01:20 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 01:01:20 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:01:20 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) INFO 04-22 01:01:20 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 01:01:20 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:01:20 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:01:21 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:01:21 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:01:25 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:01:25 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:01:25 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:01:25 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:01:25 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:01:25 [platforms/__init__.py:113] Checking if ROCm platform is available. 
+DEBUG 04-22 01:01:25 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:01:25 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:01:25 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:01:25 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:01:25 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:01:25 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:01:29 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=245) DEBUG 04-22 01:01:31 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:01:31 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=245) DEBUG 04-22 01:01:31 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/de8dec1b-171c-409d-b94e-f241299493cf'], outputs=['ipc:///tmp/7fdd1015-9215-4d2b-8f42-450f7d7dce64'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=245) DEBUG 04-22 01:01:31 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=245) DEBUG 04-22 01:01:31 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=245) DEBUG 04-22 01:01:31 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=245) DEBUG 04-22 01:01:31 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=245) DEBUG 04-22 01:01:31 [plugins/__init__.py:49] All plugins in this group will be 
loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(EngineCore pid=245) INFO 04-22 01:01:31 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-72B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-72B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-72B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [4096, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=245) WARNING 04-22 01:01:31 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=245) INFO 04-22 01:01:31 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.7.85 (local), world_size=2, local_world_size=2 +(EngineCore pid=245) DEBUG 04-22 01:01:31 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/8224b69f-9000-4fea-967e-665224f2c06f +(EngineCore pid=245) DEBUG 04-22 01:01:31 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_c7b9805d'), local_subscribe_addr='ipc:///tmp/8224b69f-9000-4fea-967e-665224f2c06f', local_notify_addr='ipc:///tmp/57224114-0656-4088-aa59-2c00950b7e84', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 01:01:34 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:01:34 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:01:34 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:01:34 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:01:34 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:01:34 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:01:34 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:01:34 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:01:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:01:34 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:01:34 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:01:34 [platforms/__init__.py:134] Checking if XPU platform is available. 
+DEBUG 04-22 01:01:34 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:01:34 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:01:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:01:34 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:01:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:01:34 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:01:34 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:01:34 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:01:34 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:01:34 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:01:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:01:34 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:01:39 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:01:39 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:01:40 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:01:40 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:01:40 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:01:40 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+DEBUG 04-22 01:01:40 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:01:40 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:01:40 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:01:40 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(Worker pid=444) DEBUG 04-22 01:01:41 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:33837 backend=nccl +(Worker pid=444) INFO 04-22 01:01:41 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:33837 backend=nccl +(APIServer pid=1) DEBUG 04-22 01:01:41 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker pid=445) DEBUG 04-22 01:01:41 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:33837 backend=nccl +(Worker pid=445) INFO 04-22 01:01:41 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:33837 backend=nccl +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=444) DEBUG 04-22 01:01:41 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=445) DEBUG 04-22 01:01:41 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 +(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=444) DEBUG 04-22 01:01:41 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=444) INFO 04-22 01:01:41 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=444) DEBUG 04-22 01:01:42 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=445) DEBUG 04-22 01:01:42 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=444) DEBUG 04-22 01:01:42 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/195cd3ba-554d-48ba-a941-6f3e1935529a +(Worker pid=444) DEBUG 04-22 01:01:42 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_8bfefc5e'), local_subscribe_addr='ipc:///tmp/195cd3ba-554d-48ba-a941-6f3e1935529a', local_notify_addr='ipc:///tmp/393fc9cb-2dfe-4d5e-ba19-e7442b94a71a', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=445) DEBUG 04-22 01:01:42 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/195cd3ba-554d-48ba-a941-6f3e1935529a +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(Worker pid=444) INFO 04-22 01:01:42 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(Worker pid=444) DEBUG 04-22 01:01:42 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776819702.890159, auto_measure=True +(Worker pid=444) DEBUG 04-22 01:01:42 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=445) DEBUG 04-22 01:01:42 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776819702.933578, auto_measure=True +(Worker pid=445) DEBUG 04-22 01:01:42 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=444) DEBUG 04-22 01:01:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:01:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=444) DEBUG 04-22 01:01:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:01:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=445) DEBUG 04-22 01:01:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=445) DEBUG 04-22 01:01:43 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=444) DEBUG 04-22 01:01:43 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=445) DEBUG 04-22 01:01:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:01:43 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=445) DEBUG 04-22 01:01:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=445) DEBUG 04-22 01:01:43 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=444) DEBUG 04-22 01:01:43 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_TP0 pid=444) INFO 04-22 01:01:43 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-72B-Instruct... +(Worker_TP0 pid=444) DEBUG 04-22 01:01:43 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_TP0 pid=444) INFO 04-22 01:01:43 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(Worker_TP0 pid=444) INFO 04-22 01:01:43 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP0 pid=444) DEBUG 04-22 01:01:43 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=444) DEBUG 04-22 01:01:43 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=444) DEBUG 04-22 01:01:43 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=444) DEBUG 04-22 01:01:43 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP1 pid=445) DEBUG 04-22 01:01:43 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP1 pid=445) DEBUG 04-22 01:01:43 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP1 pid=445) DEBUG 04-22 01:01:43 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=445) DEBUG 04-22 01:01:43 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP1 pid=445) DEBUG 04-22 01:01:44 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00016-of-00037.safetensors', 'model-00029-of-00037.safetensors', 'model-00003-of-00037.safetensors', 'model-00023-of-00037.safetensors', 'model-00019-of-00037.safetensors', 'model-00011-of-00037.safetensors', 'model-00035-of-00037.safetensors', 'model-00037-of-00037.safetensors', 'model-00024-of-00037.safetensors', 'model-00002-of-00037.safetensors', 'model-00006-of-00037.safetensors', 'model-00027-of-00037.safetensors', 'model-00018-of-00037.safetensors', 'model-00033-of-00037.safetensors', 'model-00009-of-00037.safetensors', 'model-00001-of-00037.safetensors', 'model-00004-of-00037.safetensors', 'model-00031-of-00037.safetensors', 'model-00012-of-00037.safetensors', 'model-00021-of-00037.safetensors', 'model-00026-of-00037.safetensors', 'model-00017-of-00037.safetensors', 'model-00032-of-00037.safetensors', 'model-00022-of-00037.safetensors', 'model-00028-of-00037.safetensors', 'model-00030-of-00037.safetensors', 'model-00013-of-00037.safetensors', 'model-00010-of-00037.safetensors', 'model-00008-of-00037.safetensors', 'model-00034-of-00037.safetensors', 'model-00007-of-00037.safetensors', 'model-00025-of-00037.safetensors', 'model-00020-of-00037.safetensors', 'model-00015-of-00037.safetensors', 'model-00036-of-00037.safetensors', 'model-00005-of-00037.safetensors', 'model-00014-of-00037.safetensors']] +(Worker_TP0 pid=444) DEBUG 04-22 01:01:46 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00020-of-00037.safetensors', 'model-00010-of-00037.safetensors', 'model-00001-of-00037.safetensors', 'model-00024-of-00037.safetensors', 'model-00036-of-00037.safetensors', 'model-00034-of-00037.safetensors', 'model-00007-of-00037.safetensors', 'model-00005-of-00037.safetensors', 'model-00029-of-00037.safetensors', 'model-00006-of-00037.safetensors', 'model-00011-of-00037.safetensors', 
'model-00031-of-00037.safetensors', 'model-00008-of-00037.safetensors', 'model-00030-of-00037.safetensors', 'model-00018-of-00037.safetensors', 'model-00017-of-00037.safetensors', 'model-00022-of-00037.safetensors', 'model-00019-of-00037.safetensors', 'model-00021-of-00037.safetensors', 'model-00004-of-00037.safetensors', 'model-00012-of-00037.safetensors', 'model-00037-of-00037.safetensors', 'model-00026-of-00037.safetensors', 'model-00027-of-00037.safetensors', 'model-00033-of-00037.safetensors', 'model-00003-of-00037.safetensors', 'model-00015-of-00037.safetensors', 'model-00016-of-00037.safetensors', 'model-00014-of-00037.safetensors', 'model-00002-of-00037.safetensors', 'model-00013-of-00037.safetensors', 'model-00028-of-00037.safetensors', 'model-00035-of-00037.safetensors', 'model-00009-of-00037.safetensors', 'model-00023-of-00037.safetensors', 'model-00025-of-00037.safetensors', 'model-00032-of-00037.safetensors']] +(Worker_TP0 pid=444) Loading safetensors checkpoint shards: 0% Completed | 0/37 [00:00 +(Worker_TP0 pid=444) DEBUG 04-22 01:03:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:03:25 [compilation/decorators.py:528] Start compiling function +(EngineCore pid=245) DEBUG 04-22 01:03:26 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) DEBUG 04-22 01:03:31 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=a494131711 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/93b5502620/rank_1_0/backbone +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP1 pid=445) DEBUG 04-22 
01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 
'VLLM_API_KEY': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 
'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 
'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP1 pid=445) DEBUG 
04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP1 pid=445) 
DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 
[compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 
[compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, 
+(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] Vllm config hash: a494131711 +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP0 pid=444) INFO 04-22 01:03:34 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone for vLLM's torch.compile +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=a494131711 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, 
+(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 
[compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 
[compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', 
+(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, 
+(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, 
+(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP0 pid=444) 
DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 
[compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP0 
pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 
'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] Vllm config hash: a494131711 +(Worker_TP0 pid=444) INFO 04-22 01:03:34 [compilation/backends.py:1111] Dynamo bytecode transform time: 8.74 s +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 4096 +(Worker_TP0 pid=444) INFO 04-22 01:03:34 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm +(Worker_TP0 pid=444) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. +(Worker_TP0 pid=444) return func(*args, **kwargs) +(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=4096, hidden_dim=8192, dtype=torch.bfloat16 +(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=4096, hidden_dim=8192, dtype=torch.bfloat16 +(Worker_TP0 pid=444) INFO 04-22 01:03:34 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm +(Worker_TP0 pid=444) DEBUG 04-22 01:03:35 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 4096), (4097, 8192)] +(Worker_TP0 pid=444) DEBUG 04-22 01:03:35 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(Worker_TP1 pid=445) DEBUG 04-22 01:03:36 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 01:03:36 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:36 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 01:03:36 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:36 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP1 pid=445) DEBUG 04-22 01:03:36 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 33.4 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:36 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:36 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP1 pid=445) DEBUG 04-22 01:03:36 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:36 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP0 pid=444) DEBUG 04-22 01:03:36 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 34.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:36 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:36 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP0 pid=444) DEBUG 04-22 01:03:36 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=444) INFO 04-22 01:03:38 [compilation/backends.py:372] Cache the graph of compile range (1, 4096) for later use +(Worker_TP0 pid=444) DEBUG 04-22 01:03:38 [compilation/backends.py:377] Store the 0-th graph for compile 
range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_0', '/data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_0') +(Worker_TP0 pid=444) DEBUG 04-22 01:03:39 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 01:03:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:39 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP0 pid=444) DEBUG 04-22 01:03:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=444) DEBUG 04-22 01:03:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:39 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 01:03:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:39 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP1 pid=445) DEBUG 04-22 01:03:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=445) DEBUG 04-22 01:03:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=444) INFO 04-22 01:03:39 [compilation/backends.py:372] Cache the graph of compile range (4097, 8192) for later use +(Worker_TP0 pid=444) DEBUG 04-22 01:03:39 [compilation/backends.py:377] Store the 0-th graph 
for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_0') +(Worker_TP0 pid=444) DEBUG 04-22 01:03:40 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 01:03:40 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:40 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 01:03:40 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:40 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=444) DEBUG 04-22 01:03:40 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 60.5 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:40 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:40 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=444) DEBUG 04-22 01:03:40 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:40 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=445) DEBUG 04-22 01:03:40 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 60.6 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:40 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:40 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=445) DEBUG 04-22 01:03:40 [compilation/passes/vllm_inductor_pass.py:79] 
FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:41 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_1', '/data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_1') +(Worker_TP0 pid=444) DEBUG 04-22 01:03:41 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 01:03:41 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:41 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP0 pid=444) DEBUG 04-22 01:03:41 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:41 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=444) DEBUG 04-22 01:03:41 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(APIServer pid=1) DEBUG 04-22 01:03:41 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP1 pid=445) DEBUG 04-22 01:03:41 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 01:03:41 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:41 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP1 pid=445) DEBUG 04-22 01:03:41 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:41 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=445) DEBUG 04-22 01:03:41 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:43 [compilation/backends.py:377] Store the 1-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_1') +(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 62.7 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms 
+(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 62.6 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/backends.py:377] Store the 80-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_80', '/data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_80') +(Worker_TP0 pid=444) INFO 04-22 01:03:48 [compilation/backends.py:390] Compiling a graph for compile range (1, 4096) takes 9.89 s +(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 
nodes, removed 0 nodes +(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:03:49 [compilation/backends.py:377] Store the 80-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_80', '/data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_80') +(Worker_TP0 pid=444) INFO 04-22 01:03:49 [compilation/backends.py:390] Compiling a graph for compile range (4097, 8192) takes 10.81 s +(Worker_TP0 pid=444) DEBUG 04-22 01:03:49 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone/computation_graph.py +(APIServer pid=1) DEBUG 04-22 01:03:51 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP0 pid=444) INFO 04-22 01:03:52 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/2a4892efff25a14a24b24f1dce28c4ec46ef2e01c74d72f0a4f8bb5a27b1f34a/rank_0_0/model +(Worker_TP0 pid=444) INFO 04-22 01:03:52 [compilation/monitor.py:48] torch.compile took 26.78 s in total +(Worker_TP0 pid=444) INFO 04-22 01:03:53 [compilation/monitor.py:76] Initial profiling/warmup run took 1.37 s +(Worker_TP1 pid=445) INFO 04-22 01:03:59 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP1 pid=445) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP1 pid=445) INFO 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP0 pid=444) INFO 04-22 01:03:59 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP0 pid=444) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP0 pid=444) INFO 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP1 pid=445) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:03:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:03:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:03:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:03:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 152.00 MiB first-capture + (51-1) × 18.00 MiB per-graph +(Worker_TP1 pid=445) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 152.00 MiB first-capture + (51-1) × 18.00 MiB per-graph +(Worker_TP0 pid=444) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:04:00 [compilation/cuda_graph.py:271] 
Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:04:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:04:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:04:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 266.00 MiB first-capture + (51-1) × 10.00 MiB per-graph +(Worker_TP1 pid=445) INFO 04-22 01:04:00 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP0 pid=444) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 266.00 MiB first-capture + (51-1) × 10.00 MiB per-graph +(Worker_TP0 pid=444) INFO 04-22 01:04:00 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses 
+(Worker_TP0 pid=444) DEBUG 04-22 01:04:01 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP0 pid=444) INFO 04-22 01:04:01 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.63 GiB total +(Worker_TP1 pid=445) DEBUG 04-22 01:04:01 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP1 pid=445) INFO 04-22 01:04:01 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.63 GiB total +(APIServer pid=1) DEBUG 04-22 01:04:01 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP1 pid=445) DEBUG 04-22 01:04:01 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP1 pid=445) DEBUG 04-22 01:04:01 [v1/worker/gpu_worker.py:430] Free memory after profiling: 6.39 GiB (total), 3.96 GiB (within requested) +(Worker_TP1 pid=445) DEBUG 04-22 01:04:01 [v1/worker/gpu_worker.py:435] Memory profiling takes 35.86 seconds. Total non KV cache memory: 72.19GiB; torch peak memory increase: 2.29GiB; non-torch forward increase memory: 2.09GiB; weights memory: 67.8GiB. +(Worker_TP1 pid=445) INFO 04-22 01:04:01 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9705 to maintain the same effective KV cache size. 
+(Worker_TP1 pid=445) DEBUG 04-22 01:04:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=444) DEBUG 04-22 01:04:01 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP0 pid=444) DEBUG 04-22 01:04:01 [v1/worker/gpu_worker.py:430] Free memory after profiling: 6.39 GiB (total), 3.96 GiB (within requested) +(Worker_TP0 pid=444) DEBUG 04-22 01:04:01 [v1/worker/gpu_worker.py:435] Memory profiling takes 35.86 seconds. Total non KV cache memory: 72.19GiB; torch peak memory increase: 2.29GiB; non-torch forward increase memory: 2.09GiB; weights memory: 67.8GiB. +(Worker_TP0 pid=444) INFO 04-22 01:04:01 [v1/worker/gpu_worker.py:436] Available KV cache memory: 3.04 GiB +(Worker_TP0 pid=444) INFO 04-22 01:04:01 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9705 to maintain the same effective KV cache size. 
+(Worker_TP0 pid=444) DEBUG 04-22 01:04:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 01:04:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) INFO 04-22 01:04:01 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 19,920 tokens +(EngineCore pid=245) INFO 04-22 01:04:01 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 2.43x +(Worker_TP0 pid=444) DEBUG 04-22 01:04:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=445) DEBUG 04-22 01:04:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=444) INFO 04-22 01:04:01 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 +(Worker_TP1 pid=445) INFO 04-22 01:04:01 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 +(Worker_TP0 pid=444) DEBUG 04-22 01:04:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:04:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) 2026-04-22 01:04:01,715 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP1 pid=445) 2026-04-22 01:04:01,715 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(Worker_TP0 pid=444) DEBUG 04-22 01:04:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:04:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 01:04:02 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=444) 2026-04-22 01:04:02,600 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP1 pid=445) 2026-04-22 01:04:02,600 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP0 pid=444) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=245) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=245) INFO 04-22 01:04:13 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP0 pid=444) DEBUG 04-22 01:04:13 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=445) DEBUG 04-22 01:04:13 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 01:04:13 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 01:04:13 [v1/engine/core.py:1158] EngineCore waiting for work. +(EngineCore pid=245) DEBUG 04-22 01:04:13 [v1/engine/core.py:1158] EngineCore waiting for work. 
+(APIServer pid=1) INFO 04-22 01:04:13 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] +(APIServer pid=1) WARNING 04-22 01:04:13 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'repetition_penalty': 1.05, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. +(APIServer pid=1) DEBUG 04-22 01:04:13 [renderers/base.py:197] Warming up chat template processing... +(APIServer pid=1) DEBUG 04-22 01:04:14 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e81e8d-656378cd6e455ee10a8c4389;0c25ab62-8556-4c4a-ae58-f8c6b6ad7f4c) +(APIServer pid=1) DEBUG 04-22 01:04:14 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 01:04:14 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/resolve/main/processor_config.json. +(APIServer pid=1) DEBUG 04-22 01:04:14 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e81e8e-69d19a9d3a125bad77599992;b4b8e27f-b6b6-4c43-8945-4a13b3734d5e) +(APIServer pid=1) DEBUG 04-22 01:04:14 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 01:04:14 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/resolve/main/preprocessor_config.json. +(Worker_TP0 pid=444) DEBUG 04-22 01:04:14 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=445) DEBUG 04-22 01:04:14 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) INFO 04-22 01:04:14 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. 
+(APIServer pid=1) DEBUG 04-22 01:04:14 [renderers/base.py:203] Chat template warmup completed in 0.664s +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:37] Available routes are: +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: HEAD, GET +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /docs, Methods: HEAD, GET +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: HEAD, GET +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /redoc, Methods: HEAD, GET +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /load, Methods: GET +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /version, Methods: GET +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /health, Methods: GET +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST +(APIServer pid=1) INFO 04-22 01:04:14 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST +(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST +(APIServer pid=1) INFO: Started server process [1] +(APIServer pid=1) INFO: Waiting for application startup. +(APIServer pid=1) INFO: Application startup complete. +(APIServer pid=1) DEBUG 04-22 01:04:18 [v1/engine/async_llm.py:875] Called check_health. +(APIServer pid=1) INFO: 10.131.6.2:33822 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.log b/accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.log new file mode 100644 index 00000000..472a90e9 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.log @@ -0,0 +1,746 @@ +DEBUG 04-22 00:24:34 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
+DEBUG 04-22 00:24:34 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:24:34 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:24:34 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:24:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:24:34 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:24:34 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:24:34 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:24:34 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:24:34 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:24:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:24:34 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:24:39 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:24:41 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 00:24:41 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:24:41 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:24:41 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:24:41 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:24:41 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:24:41 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:24:41 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:24:41 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct +(APIServer pid=1) INFO 04-22 00:24:41 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:24:41 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:24:41 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'dtype': 'bfloat16', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:24:41 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:24:41 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:24:41 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003835 secs +(APIServer pid=1) INFO 04-22 00:24:41 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM +(APIServer pid=1) INFO 04-22 00:24:41 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:24:41 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:24:41 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:24:41 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:24:41 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:24:41 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) INFO 04-22 00:24:41 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:24:41 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:24:41 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:24:41 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:24:42 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:24:42 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:24:45 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:24:45 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:24:45 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:24:45 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:24:45 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:24:45 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:24:45 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:24:45 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:24:45 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:24:45 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:24:45 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:24:45 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:24:50 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+(EngineCore pid=244) DEBUG 04-22 00:24:51 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:24:51 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=244) DEBUG 04-22 00:24:51 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/7e7d4e07-bf7f-4693-b1d1-eb24ab15edc5'], outputs=['ipc:///tmp/6b4872f8-b412-42ab-a84a-a9ba659e3875'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=244) DEBUG 04-22 00:24:51 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=244) DEBUG 04-22 00:24:52 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=244) DEBUG 04-22 00:24:52 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=244) DEBUG 04-22 00:24:52 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=244) DEBUG 04-22 00:24:52 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=244) INFO 04-22 00:24:52 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=244) DEBUG 04-22 00:24:52 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.194:50973 backend=nccl +(EngineCore pid=244) INFO 04-22 00:24:52 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.194:50973 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) DEBUG 04-22 00:24:52 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) INFO 04-22 00:24:52 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=244) DEBUG 04-22 00:24:52 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776817492.8586414, auto_measure=True +(EngineCore pid=244) DEBUG 04-22 00:24:52 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=244) DEBUG 04-22 00:24:52 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:24:52 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:24:52 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:24:52 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=244) DEBUG 04-22 00:24:53 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=244) DEBUG 04-22 00:24:53 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=244) DEBUG 04-22 00:24:53 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=244) INFO 04-22 00:24:53 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... +(EngineCore pid=244) DEBUG 04-22 00:24:53 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=244) INFO 04-22 00:24:53 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=244) INFO 04-22 00:24:53 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=244) DEBUG 04-22 00:24:53 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=244) DEBUG 04-22 00:24:53 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=244) DEBUG 04-22 00:24:53 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=244) DEBUG 04-22 00:24:53 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=244) DEBUG 04-22 00:24:54 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00002-of-00004.safetensors']] +(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=244) DEBUG 04-22 
00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=244) INFO 
04-22 00:25:11 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/7a80f98c72/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=1a6080d116 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/7a80f98c72/rank_0_0/backbone +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, 
+(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 
[compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 
[compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, 
+(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, 
+(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore 
pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=244) 
DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
+(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] Vllm config hash: 1a6080d116 +(EngineCore pid=244) INFO 04-22 00:25:11 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.50 s +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(APIServer pid=1) DEBUG 04-22 00:25:12 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=244) DEBUG 04-22 00:25:12 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:25:12 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=244) DEBUG 04-22 00:25:12 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=244) DEBUG 04-22 00:25:12 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:25:12 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=244) INFO 04-22 00:25:13 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=244) DEBUG 04-22 00:25:13 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/7a80f98c72/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=244) DEBUG 04-22 00:25:14 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:25:14 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(EngineCore pid=244) DEBUG 
04-22 00:25:14 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.2 ms +(EngineCore pid=244) DEBUG 04-22 00:25:14 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:25:14 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=244) DEBUG 04-22 00:25:15 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/7a80f98c72/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=244) DEBUG 04-22 00:25:16 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:25:16 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(EngineCore pid=244) DEBUG 04-22 00:25:16 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=244) DEBUG 04-22 00:25:16 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:25:16 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=244) DEBUG 04-22 00:25:16 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/7a80f98c72/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') +(EngineCore pid=244) INFO 04-22 00:25:16 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.04 s +(EngineCore pid=244) DEBUG 04-22 00:25:16 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/7a80f98c72/rank_0_0/backbone/computation_graph.py +(EngineCore 
pid=244) INFO 04-22 00:25:17 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/17c675e99ca757334c44a733bfd7950765a3ca0d859504322713f76ad4aa0025/rank_0_0/model +(EngineCore pid=244) INFO 04-22 00:25:17 [compilation/monitor.py:48] torch.compile took 10.20 s in total +(EngineCore pid=244) INFO 04-22 00:25:18 [compilation/monitor.py:76] Initial profiling/warmup run took 0.43 s +(APIServer pid=1) DEBUG 04-22 00:25:22 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=244) INFO 04-22 00:25:23 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=244) INFO 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:25:23 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:25:23 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:25:23 
[v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 160.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:25:23 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:25:23 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 228.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-22 00:25:24 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=244) INFO 04-22 00:25:24 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.61 GiB total +(EngineCore pid=244) DEBUG 04-22 00:25:24 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=244) DEBUG 04-22 00:25:24 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.98 GiB (total), 59.53 GiB (within requested) +(EngineCore pid=244) DEBUG 04-22 00:25:24 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.20 seconds. 
Total non KV cache memory: 16.7GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.24GiB; weights memory: 14.25GiB. +(EngineCore pid=244) INFO 04-22 00:25:24 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.53 GiB +(EngineCore pid=244) INFO 04-22 00:25:24 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9577 to maintain the same effective KV cache size. +(EngineCore pid=244) INFO 04-22 00:25:24 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,096,000 tokens +(EngineCore pid=244) INFO 04-22 00:25:24 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 133.79x +(EngineCore pid=244) 2026-04-22 00:25:25,237 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=244) DEBUG 04-22 00:25:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) 2026-04-22 00:25:25,244 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:51:46 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:51:46 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:51:47 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:51:47 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:51:47 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:51:47 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct +(APIServer pid=1) INFO 04-22 01:51:47 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:51:47 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:51:47 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'max_model_len': 16384, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:51:47 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:51:47 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:51:47 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003766 secs +(APIServer pid=1) INFO 04-22 01:51:47 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM +(APIServer pid=1) INFO 04-22 01:51:47 [config/model.py:1678] Using max model len 16384 +(APIServer pid=1) DEBUG 04-22 01:51:47 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:51:47 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:51:47 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:51:47 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:51:47 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) INFO 04-22 01:51:47 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:51:47 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 01:51:47 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:51:47 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:51:47 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:51:47 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:51:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:51:51 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:51:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:51:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:51:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:51:51 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:51:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:51:51 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:51:51 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:51:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:51:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:51:51 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:51:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+(EngineCore pid=242) DEBUG 04-22 01:51:57 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:51:57 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=242) DEBUG 04-22 01:51:57 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/2c69b5ce-1cf7-406f-854c-eb1818608546'], outputs=['ipc:///tmp/779d537c-a3ed-4354-9f61-af1e644c1cb0'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=242) DEBUG 04-22 01:51:57 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=242) DEBUG 04-22 01:51:57 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=242) DEBUG 04-22 01:51:57 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=242) DEBUG 04-22 01:51:57 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=242) DEBUG 04-22 01:51:57 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=242) INFO 04-22 01:51:57 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=242) DEBUG 04-22 01:51:58 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.41:55703 backend=nccl +(EngineCore pid=242) INFO 04-22 01:51:58 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.41:55703 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=242) DEBUG 04-22 01:51:58 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=242) INFO 04-22 01:51:58 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=242) DEBUG 04-22 01:51:58 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822718.9894435, auto_measure=True +(EngineCore pid=242) DEBUG 04-22 01:51:58 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=242) DEBUG 04-22 01:51:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=242) DEBUG 04-22 01:51:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=242) DEBUG 04-22 01:51:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=242) DEBUG 04-22 01:51:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=242) DEBUG 04-22 01:51:59 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=242) DEBUG 04-22 01:51:59 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=242) DEBUG 04-22 01:51:59 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=242) INFO 04-22 01:51:59 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... +(EngineCore pid=242) DEBUG 04-22 01:51:59 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=242) INFO 04-22 01:51:59 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=242) INFO 04-22 01:51:59 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=242) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=242) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=242) DEBUG 04-22 01:51:59 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=242) DEBUG 04-22 01:51:59 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=242) DEBUG 04-22 01:51:59 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=242) DEBUG 04-22 01:51:59 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=242) DEBUG 04-22 01:52:00 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors']] +(EngineCore pid=242) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=242) DEBUG 04-22 
01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=242) INFO 
04-22 01:52:12 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/71bc44c872/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=759a699594 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/71bc44c872/rank_0_0/backbone +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, 
+(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 
[compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 
[compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, 
+(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, 
+(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore 
pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=242) 
DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
+(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] Vllm config hash: 759a699594 +(EngineCore pid=242) INFO 04-22 01:52:12 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.55 s +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=242) DEBUG 04-22 01:52:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=242) DEBUG 04-22 01:52:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=242) DEBUG 04-22 01:52:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=242) DEBUG 04-22 01:52:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=242) DEBUG 04-22 01:52:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=242) INFO 04-22 01:52:14 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=242) DEBUG 04-22 01:52:14 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/71bc44c872/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=242) DEBUG 04-22 01:52:15 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=242) DEBUG 04-22 01:52:15 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=242) DEBUG 04-22 01:52:15 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=242) DEBUG 
04-22 01:52:15 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=242) DEBUG 04-22 01:52:15 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=242) DEBUG 04-22 01:52:16 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/71bc44c872/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=242) DEBUG 04-22 01:52:17 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=242) DEBUG 04-22 01:52:17 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.4 ms +(EngineCore pid=242) DEBUG 04-22 01:52:17 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=242) DEBUG 04-22 01:52:17 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=242) DEBUG 04-22 01:52:17 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=242) DEBUG 04-22 01:52:17 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/71bc44c872/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') +(EngineCore pid=242) INFO 04-22 01:52:17 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.19 s +(EngineCore pid=242) DEBUG 04-22 01:52:17 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/71bc44c872/rank_0_0/backbone/computation_graph.py +(APIServer pid=1) DEBUG 04-22 01:52:17 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=242) INFO 04-22 01:52:18 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/3040b79a955eecaf30ddc5d0b4e18acda7953487e706565418941f6941fe40ca/rank_0_0/model +(EngineCore pid=242) INFO 04-22 01:52:18 [compilation/monitor.py:48] torch.compile took 10.40 s in total +(EngineCore pid=242) INFO 04-22 01:52:19 [compilation/monitor.py:76] Initial profiling/warmup run took 0.44 s +(EngineCore pid=242) INFO 04-22 01:52:24 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=242) DEBUG 04-22 01:52:24 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=242) INFO 04-22 01:52:24 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 01:52:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 01:52:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 160.00 MiB first-capture + (51-1) × 4.00 MiB per-graph 
+(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 01:52:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 01:52:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 228.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=242) INFO 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.61 GiB total +(EngineCore pid=242) DEBUG 04-22 01:52:26 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=242) DEBUG 04-22 01:52:26 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.98 GiB (total), 59.53 GiB (within requested) +(EngineCore pid=242) DEBUG 04-22 01:52:26 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.58 seconds. Total non KV cache memory: 16.7GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.24GiB; weights memory: 14.25GiB. 
+(EngineCore pid=242) INFO 04-22 01:52:26 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.53 GiB +(EngineCore pid=242) INFO 04-22 01:52:26 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9577 to maintain the same effective KV cache size. +(EngineCore pid=242) INFO 04-22 01:52:26 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,095,968 tokens +(EngineCore pid=242) INFO 04-22 01:52:26 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 16,384 tokens per request: 66.89x +(EngineCore pid=242) 2026-04-22 01:52:26,135 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=242) DEBUG 04-22 01:52:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) 2026-04-22 01:52:26,143 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=242) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:52:57 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:52:57 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:52:57 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:52:57 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:52:57 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:52:57 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct +(APIServer pid=1) INFO 04-22 01:52:57 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:52:57 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:52:57 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'max_model_len': 32768, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:52:57 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:52:58 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:52:58 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0004144 secs +(APIServer pid=1) INFO 04-22 01:52:58 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM +(APIServer pid=1) INFO 04-22 01:52:58 [config/model.py:1678] Using max model len 32768 +(APIServer pid=1) DEBUG 04-22 01:52:58 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:52:58 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:52:58 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:52:58 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:52:58 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) INFO 04-22 01:52:58 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:52:58 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 01:52:58 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:52:58 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:52:58 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:52:58 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:53:02 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:53:02 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:53:02 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:53:02 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:53:02 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:53:02 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:53:02 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:53:02 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:53:02 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:53:02 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:53:02 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:53:02 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:53:07 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+(EngineCore pid=244) DEBUG 04-22 01:53:08 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:53:08 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=244) DEBUG 04-22 01:53:08 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/22293a4a-e2e4-42a4-80a9-340df849e155'], outputs=['ipc:///tmp/c14af7ba-85e0-4a40-9bdd-815d5a2e4143'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=244) DEBUG 04-22 01:53:08 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=244) DEBUG 04-22 01:53:08 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=244) DEBUG 04-22 01:53:08 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=244) DEBUG 04-22 01:53:08 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=244) DEBUG 04-22 01:53:08 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=244) INFO 04-22 01:53:08 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=244) DEBUG 04-22 01:53:09 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.42:54577 backend=nccl +(EngineCore pid=244) INFO 04-22 01:53:09 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.42:54577 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) DEBUG 04-22 01:53:09 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) INFO 04-22 01:53:09 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=244) DEBUG 04-22 01:53:09 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822789.6017933, auto_measure=True +(EngineCore pid=244) DEBUG 04-22 01:53:09 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=244) DEBUG 04-22 01:53:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 01:53:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=244) DEBUG 04-22 01:53:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 01:53:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=244) DEBUG 04-22 01:53:09 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=244) DEBUG 04-22 01:53:09 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=244) DEBUG 04-22 01:53:09 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=244) INFO 04-22 01:53:09 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... +(EngineCore pid=244) DEBUG 04-22 01:53:10 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=244) INFO 04-22 01:53:10 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=244) INFO 04-22 01:53:10 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=244) DEBUG 04-22 01:53:10 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=244) DEBUG 04-22 01:53:10 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=244) DEBUG 04-22 01:53:10 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=244) DEBUG 04-22 01:53:10 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=244) DEBUG 04-22 01:53:10 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors']] +(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=244) DEBUG 04-22 
01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=244) INFO 
04-22 01:53:22 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/517af7d22e/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=9fd3f9070d comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/517af7d22e/rank_0_0/backbone +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, 
+(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 
[compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 
[compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, 
+(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, 
+(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore 
pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=244) 
DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
+(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} +(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] Vllm config hash: 9fd3f9070d +(EngineCore pid=244) INFO 04-22 01:53:22 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.60 s +(EngineCore pid=244) DEBUG 04-22 01:53:23 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=244) DEBUG 04-22 01:53:23 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=244) DEBUG 04-22 01:53:23 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 01:53:23 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=244) DEBUG 04-22 01:53:23 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 3.7 ms +(EngineCore pid=244) DEBUG 04-22 01:53:23 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 01:53:23 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=244) INFO 04-22 01:53:25 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=244) DEBUG 04-22 01:53:25 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/517af7d22e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=244) DEBUG 04-22 01:53:26 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 01:53:26 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=244) DEBUG 04-22 01:53:26 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.3 ms +(EngineCore pid=244) DEBUG 
04-22 01:53:26 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 01:53:26 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=244) DEBUG 04-22 01:53:27 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/517af7d22e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=244) DEBUG 04-22 01:53:28 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 01:53:28 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.0 ms +(EngineCore pid=244) DEBUG 04-22 01:53:28 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=244) DEBUG 04-22 01:53:28 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 01:53:28 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(APIServer pid=1) DEBUG 04-22 01:53:28 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=244) DEBUG 04-22 01:53:28 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/517af7d22e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') +(EngineCore pid=244) INFO 04-22 01:53:28 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.20 s +(EngineCore pid=244) DEBUG 04-22 01:53:28 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/517af7d22e/rank_0_0/backbone/computation_graph.py +(EngineCore pid=244) INFO 04-22 01:53:29 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/af475fe7eb320dca54576c329c21f5832d9a8ff077be11492d01fe7d66d5502a/rank_0_0/model +(EngineCore pid=244) INFO 04-22 01:53:29 [compilation/monitor.py:48] torch.compile took 10.53 s in total +(EngineCore pid=244) INFO 04-22 01:53:30 [compilation/monitor.py:76] Initial profiling/warmup run took 0.62 s +(EngineCore pid=244) INFO 04-22 01:53:35 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=244) DEBUG 04-22 01:53:35 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=244) INFO 04-22 01:53:35 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 01:53:36 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore 
pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 01:53:36 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 160.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 01:53:36 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 01:53:36 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 228.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=244) INFO 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.61 GiB total +(EngineCore pid=244) DEBUG 04-22 
01:53:37 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=244) DEBUG 04-22 01:53:37 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.98 GiB (total), 59.53 GiB (within requested) +(EngineCore pid=244) DEBUG 04-22 01:53:37 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.87 seconds. Total non KV cache memory: 16.7GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.24GiB; weights memory: 14.25GiB. +(EngineCore pid=244) INFO 04-22 01:53:37 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.53 GiB +(EngineCore pid=244) INFO 04-22 01:53:37 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9577 to maintain the same effective KV cache size. +(EngineCore pid=244) INFO 04-22 01:53:37 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,096,000 tokens +(EngineCore pid=244) INFO 04-22 01:53:37 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 32,768 tokens per request: 33.45x +(EngineCore pid=244) 2026-04-22 01:53:37,067 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(EngineCore pid=244) DEBUG 04-22 01:53:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) 2026-04-22 01:53:37,075 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:49:25 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:49:25 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 01:49:26 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:49:26 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:49:26 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:49:26 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct +(APIServer pid=1) INFO 04-22 01:49:26 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:49:26 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:49:26 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'max_model_len': 2048, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:49:26 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:49:26 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:49:26 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0023413 secs +(APIServer pid=1) INFO 04-22 01:49:26 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM +(APIServer 
pid=1) INFO 04-22 01:49:26 [config/model.py:1678] Using max model len 2048 +(APIServer pid=1) DEBUG 04-22 01:49:26 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:49:26 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:49:26 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:49:26 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:49:26 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 01:49:26 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:49:26 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 01:49:26 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:49:26 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:49:27 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:49:27 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:49:30 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:49:30 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:49:30 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:49:30 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:49:30 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:49:30 [platforms/__init__.py:113] Checking if ROCm platform is available. 
+DEBUG 04-22 01:49:30 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:49:30 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:49:30 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:49:30 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:49:30 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:49:30 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:49:35 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=243) DEBUG 04-22 01:49:36 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:49:36 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=243) DEBUG 04-22 01:49:36 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/a61479ac-38ea-47e6-b2c3-71d688d66295'], outputs=['ipc:///tmp/815053a2-58ee-414b-8fcf-cb5d3eabb9a5'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 01:49:36 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 01:49:36 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 01:49:36 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 01:49:36 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 01:49:36 [plugins/__init__.py:49] All plugins in this group will be 
loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(EngineCore pid=243) INFO 04-22 01:49:36 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) DEBUG 04-22 01:49:37 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.39:33129 backend=nccl +(EngineCore pid=243) INFO 04-22 01:49:37 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.39:33129 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) DEBUG 04-22 01:49:37 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) INFO 04-22 01:49:37 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=243) DEBUG 04-22 01:49:37 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822577.8015513, auto_measure=True +(EngineCore pid=243) DEBUG 04-22 01:49:37 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=243) DEBUG 04-22 01:49:37 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:49:37 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:49:37 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:49:37 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=243) DEBUG 04-22 01:49:37 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=243) DEBUG 04-22 01:49:37 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=243) DEBUG 04-22 01:49:37 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=243) INFO 04-22 01:49:38 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... +(EngineCore pid=243) DEBUG 04-22 01:49:38 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=243) INFO 04-22 01:49:38 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=243) INFO 04-22 01:49:38 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=243) DEBUG 04-22 01:49:38 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=243) DEBUG 04-22 01:49:38 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=243) DEBUG 04-22 01:49:38 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=243) DEBUG 04-22 01:49:38 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=243) DEBUG 04-22 01:49:38 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors']] +(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=243) DEBUG 04-22 
01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=243) INFO 
04-22 01:49:56 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/c1a43ecc55/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=a0c7a93070 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/c1a43ecc55/rank_0_0/backbone +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, 
+(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 
[compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 
[compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, 
+(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, 
+(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore 
pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=243) 
DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
+(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] Vllm config hash: a0c7a93070 +(EngineCore pid=243) INFO 04-22 01:49:56 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.53 s +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(APIServer pid=1) DEBUG 04-22 01:49:56 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=243) DEBUG 04-22 01:49:57 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:49:57 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 01:49:57 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms +(EngineCore pid=243) DEBUG 04-22 01:49:57 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:49:57 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) INFO 04-22 01:49:59 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=243) DEBUG 04-22 01:49:59 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/c1a43ecc55/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=243) DEBUG 04-22 01:49:59 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:49:59 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=243) DEBUG 
04-22 01:49:59 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=243) DEBUG 04-22 01:49:59 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:49:59 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) DEBUG 04-22 01:50:00 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/c1a43ecc55/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=243) DEBUG 04-22 01:50:01 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:50:01 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(EngineCore pid=243) DEBUG 04-22 01:50:01 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=243) DEBUG 04-22 01:50:01 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:50:01 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 01:50:01 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/c1a43ecc55/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') +(EngineCore pid=243) INFO 04-22 01:50:01 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.16 s +(EngineCore pid=243) DEBUG 04-22 01:50:02 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/c1a43ecc55/rank_0_0/backbone/computation_graph.py +(EngineCore 
pid=243) INFO 04-22 01:50:03 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/2b9e7b7e03218b45968c79fc3d87056325d0376cdeef027b60de5c4be6eeb4c9/rank_0_0/model +(EngineCore pid=243) INFO 04-22 01:50:03 [compilation/monitor.py:48] torch.compile took 10.32 s in total +(EngineCore pid=243) INFO 04-22 01:50:03 [compilation/monitor.py:76] Initial profiling/warmup run took 0.43 s +(APIServer pid=1) DEBUG 04-22 01:50:06 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=243) INFO 04-22 01:50:09 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=243) INFO 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:50:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:50:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:50:09 
[v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 160.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:50:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:50:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 228.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=243) INFO 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.61 GiB total +(EngineCore pid=243) DEBUG 04-22 01:50:10 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=243) DEBUG 04-22 01:50:10 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.98 GiB (total), 59.53 GiB (within requested) +(EngineCore pid=243) DEBUG 04-22 01:50:10 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.42 seconds. 
Total non KV cache memory: 16.7GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.24GiB; weights memory: 14.25GiB. +(EngineCore pid=243) INFO 04-22 01:50:10 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.53 GiB +(EngineCore pid=243) INFO 04-22 01:50:10 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9577 to maintain the same effective KV cache size. +(EngineCore pid=243) INFO 04-22 01:50:10 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,096,000 tokens +(EngineCore pid=243) INFO 04-22 01:50:10 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 2,048 tokens per request: 535.16x +(EngineCore pid=243) 2026-04-22 01:50:10,204 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=243) DEBUG 04-22 01:50:10 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) 2026-04-22 01:50:10,212 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:50:36 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:50:36 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:50:36 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:50:36 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:50:36 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:50:36 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct +(APIServer pid=1) INFO 04-22 01:50:36 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:50:36 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:50:36 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'max_model_len': 4096, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:50:36 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:50:37 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:50:37 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0011555 secs +(APIServer pid=1) INFO 04-22 01:50:37 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM +(APIServer pid=1) INFO 04-22 01:50:37 [config/model.py:1678] Using max model len 4096 +(APIServer pid=1) DEBUG 04-22 01:50:37 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:50:37 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:50:37 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:50:37 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:50:37 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) INFO 04-22 01:50:37 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:50:37 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 01:50:37 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:50:37 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:50:37 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:50:37 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:50:41 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:50:41 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:50:41 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:50:41 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:50:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:50:41 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:50:41 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:50:41 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:50:41 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:50:41 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:50:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:50:41 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:50:46 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+(EngineCore pid=244) DEBUG 04-22 01:50:47 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:50:47 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=244) DEBUG 04-22 01:50:47 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/f918c33e-6b86-4f3c-9f2f-18041540ee46'], outputs=['ipc:///tmp/f05cb364-a727-4fa2-9e67-4ac7d5cd3acf'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=244) DEBUG 04-22 01:50:47 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=244) DEBUG 04-22 01:50:47 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=244) DEBUG 04-22 01:50:47 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=244) DEBUG 04-22 01:50:47 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=244) DEBUG 04-22 01:50:47 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=244) INFO 04-22 01:50:47 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=244) DEBUG 04-22 01:50:48 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.40:53241 backend=nccl +(EngineCore pid=244) INFO 04-22 01:50:48 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.40:53241 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) DEBUG 04-22 01:50:48 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) INFO 04-22 01:50:48 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=244) DEBUG 04-22 01:50:48 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822648.5352857, auto_measure=True +(EngineCore pid=244) DEBUG 04-22 01:50:48 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=244) DEBUG 04-22 01:50:48 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 01:50:48 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=244) DEBUG 04-22 01:50:48 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 01:50:48 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=244) DEBUG 04-22 01:50:48 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=244) DEBUG 04-22 01:50:48 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=244) DEBUG 04-22 01:50:48 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=244) INFO 04-22 01:50:48 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... +(EngineCore pid=244) DEBUG 04-22 01:50:49 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=244) INFO 04-22 01:50:49 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=244) INFO 04-22 01:50:49 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=244) DEBUG 04-22 01:50:49 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=244) DEBUG 04-22 01:50:49 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=244) DEBUG 04-22 01:50:49 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=244) DEBUG 04-22 01:50:49 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=244) DEBUG 04-22 01:50:49 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors']] +(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(APIServer pid=1) DEBUG 04-22 01:50:57 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=244) INFO 04-22 01:50:59 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/e690b99c41/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=9f1e1a0fca comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/e690b99c41/rank_0_0/backbone +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-22 01:50:59 
[compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
+(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=244) DEBUG 04-22 
01:50:59 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=244) 
DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=244) DEBUG 04-22 01:50:59 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=244) DEBUG 04-22 01:50:59 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=244) DEBUG 04-22 
01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore 
pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore 
pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] Vllm config hash: 9f1e1a0fca +(EngineCore pid=244) INFO 04-22 01:50:59 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.53 s +(EngineCore pid=244) DEBUG 04-22 01:51:00 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=244) DEBUG 04-22 01:51:00 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=244) DEBUG 04-22 01:51:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 01:51:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=244) DEBUG 04-22 01:51:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=244) DEBUG 04-22 01:51:00 [compilation/.../utility/fix_functionalization.py:203] 
De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 01:51:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=244) INFO 04-22 01:51:02 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=244) DEBUG 04-22 01:51:02 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/e690b99c41/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=244) DEBUG 04-22 01:51:02 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 01:51:02 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=244) DEBUG 04-22 01:51:02 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=244) DEBUG 04-22 01:51:02 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 01:51:02 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=244) DEBUG 04-22 01:51:04 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/e690b99c41/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=244) DEBUG 04-22 01:51:04 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 01:51:04 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(EngineCore pid=244) DEBUG 04-22 01:51:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=244) 
DEBUG 04-22 01:51:04 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 01:51:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=244) DEBUG 04-22 01:51:05 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/e690b99c41/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') +(EngineCore pid=244) INFO 04-22 01:51:05 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.10 s +(EngineCore pid=244) DEBUG 04-22 01:51:05 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/e690b99c41/rank_0_0/backbone/computation_graph.py +(EngineCore pid=244) INFO 04-22 01:51:06 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/48014cda5f34d6b24aefcc31b0f02d5d7bedc9855e095468b3e5a530b56177cf/rank_0_0/model +(EngineCore pid=244) INFO 04-22 01:51:06 [compilation/monitor.py:48] torch.compile took 10.13 s in total +(EngineCore pid=244) INFO 04-22 01:51:06 [compilation/monitor.py:76] Initial profiling/warmup run took 0.43 s +(APIServer pid=1) DEBUG 04-22 01:51:07 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=244) INFO 04-22 01:51:12 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=244) INFO 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 01:51:12 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 01:51:12 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 160.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 01:51:12 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
+(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 01:51:12 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 228.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-22 01:51:13 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=244) INFO 04-22 01:51:13 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.61 GiB total +(EngineCore pid=244) DEBUG 04-22 01:51:13 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=244) DEBUG 04-22 01:51:13 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.98 GiB (total), 59.53 GiB (within requested) +(EngineCore pid=244) DEBUG 04-22 01:51:13 [v1/worker/gpu_worker.py:435] Memory profiling takes 16.99 seconds. Total non KV cache memory: 16.7GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.24GiB; weights memory: 14.25GiB. +(EngineCore pid=244) INFO 04-22 01:51:13 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.53 GiB +(EngineCore pid=244) INFO 04-22 01:51:13 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9577 to maintain the same effective KV cache size. 
+(EngineCore pid=244) INFO 04-22 01:51:13 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,096,000 tokens +(EngineCore pid=244) INFO 04-22 01:51:13 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 4,096 tokens per request: 267.58x +(EngineCore pid=244) 2026-04-22 01:51:13,316 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=244) DEBUG 04-22 01:51:13 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) 2026-04-22 01:51:13,324 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:00:18 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:00:18 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:00:19 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:00:19 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:00:19 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:00:19 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct +(APIServer pid=1) INFO 04-22 01:00:19 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:00:19 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:00:19 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:00:19 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:00:19 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:00:19 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0004101 secs +(APIServer pid=1) INFO 04-22 01:00:19 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM +(APIServer pid=1) INFO 04-22 01:00:19 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 01:00:19 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:00:19 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:00:19 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:00:19 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:00:19 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) INFO 04-22 01:00:19 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:00:19 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 01:00:19 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:00:19 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:00:19 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:00:19 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:00:23 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:00:23 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:00:23 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:00:23 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:00:23 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:00:23 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:00:23 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:00:23 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:00:23 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:00:23 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:00:23 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:00:23 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:00:28 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+(EngineCore pid=242) DEBUG 04-22 01:00:29 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:00:29 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=242) DEBUG 04-22 01:00:29 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/70fbe184-2f30-4d0d-80e0-3c9f4a4363a7'], outputs=['ipc:///tmp/3deaa7ca-dacc-495c-bff0-288f35c7b679'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=242) DEBUG 04-22 01:00:29 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=242) DEBUG 04-22 01:00:29 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=242) DEBUG 04-22 01:00:29 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=242) DEBUG 04-22 01:00:29 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=242) DEBUG 04-22 01:00:29 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=242) INFO 04-22 01:00:29 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=242) DEBUG 04-22 01:00:30 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.84:48247 backend=nccl +(EngineCore pid=242) INFO 04-22 01:00:30 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.84:48247 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=242) DEBUG 04-22 01:00:30 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=242) INFO 04-22 01:00:30 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=242) DEBUG 04-22 01:00:30 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776819630.9128685, auto_measure=True +(EngineCore pid=242) DEBUG 04-22 01:00:30 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=242) DEBUG 04-22 01:00:31 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=242) DEBUG 04-22 01:00:31 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=242) DEBUG 04-22 01:00:31 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=242) DEBUG 04-22 01:00:31 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=242) DEBUG 04-22 01:00:31 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=242) DEBUG 04-22 01:00:31 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=242) DEBUG 04-22 01:00:31 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=242) INFO 04-22 01:00:31 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... +(EngineCore pid=242) DEBUG 04-22 01:00:31 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=242) INFO 04-22 01:00:31 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=242) INFO 04-22 01:00:31 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=242) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=242) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=242) DEBUG 04-22 01:00:31 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=242) DEBUG 04-22 01:00:31 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=242) DEBUG 04-22 01:00:31 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=242) DEBUG 04-22 01:00:31 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=242) DEBUG 04-22 01:00:32 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors']] +(EngineCore pid=242) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:38:34 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:38:34 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:38:34 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:38:34 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:38:34 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:38:34 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct +(APIServer pid=1) INFO 04-22 01:38:34 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:38:34 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:38:34 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:38:34 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:38:35 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:38:35 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0025655 secs +(APIServer pid=1) INFO 04-22 01:38:35 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM +(APIServer pid=1) INFO 04-22 01:38:35 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 01:38:35 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:38:35 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:38:35 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:38:35 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:38:35 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 01:38:35 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 01:38:35 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:38:35 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) DEBUG 04-22 01:38:37 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. +(APIServer pid=1) INFO 04-22 01:38:37 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 01:38:37 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:38:37 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:38:38 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:38:38 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:38:41 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:38:41 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:38:41 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:38:41 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:38:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
+DEBUG 04-22 01:38:41 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:38:41 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:38:41 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:38:41 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:38:41 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:38:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:38:41 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:38:46 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=242) DEBUG 04-22 01:38:48 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:38:48 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=242) DEBUG 04-22 01:38:48 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/a9aaebbc-67b3-48c3-bb08-891f4fe6d87b'], outputs=['ipc:///tmp/7d27bee5-a979-4651-a995-18123120cd09'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=242) DEBUG 04-22 01:38:48 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=242) DEBUG 04-22 01:38:48 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=242) DEBUG 04-22 01:38:48 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=242) DEBUG 04-22 01:38:48 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=242) DEBUG 04-22 01:38:48 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=242) INFO 04-22 01:38:48 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=242) WARNING 04-22 01:38:48 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=242) INFO 04-22 01:38:48 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.129.9.33 (local), world_size=2, local_world_size=2 +(EngineCore pid=242) DEBUG 04-22 01:38:48 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/5d89bb73-e1b7-4ead-af9d-0d069eb0cbe8 +(EngineCore pid=242) DEBUG 04-22 01:38:48 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_1b7e4d47'), local_subscribe_addr='ipc:///tmp/5d89bb73-e1b7-4ead-af9d-0d069eb0cbe8', local_notify_addr='ipc:///tmp/17f21fec-15b6-4793-af53-6da7e4ec48d3', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 01:38:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:38:51 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:38:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:38:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:38:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:38:51 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:38:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:38:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:38:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:38:51 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:38:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:38:51 [platforms/__init__.py:134] Checking if XPU platform is available. 
+DEBUG 04-22 01:38:51 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:38:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:38:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:38:51 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:38:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:38:51 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:38:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:38:51 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:38:51 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:38:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:38:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:38:51 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:38:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:38:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:38:57 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:38:57 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:38:57 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:38:57 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+DEBUG 04-22 01:38:58 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:38:58 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:38:58 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:38:58 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) DEBUG 04-22 01:38:58 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker pid=441) DEBUG 04-22 01:38:58 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:40119 backend=nccl +(Worker pid=441) INFO 04-22 01:38:58 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:40119 backend=nccl +(Worker pid=442) DEBUG 04-22 01:38:58 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:40119 backend=nccl +(Worker pid=442) INFO 04-22 01:38:58 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:40119 backend=nccl +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=442) DEBUG 04-22 01:38:58 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=441) DEBUG 04-22 01:38:58 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 1 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 +(Worker pid=441) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=442) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=441) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=442) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=441) DEBUG 04-22 01:38:58 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=441) INFO 04-22 01:38:58 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=442) DEBUG 04-22 01:38:59 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=441) DEBUG 04-22 01:38:59 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=441) DEBUG 04-22 01:38:59 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/9635c6e6-7573-4cb8-ba15-e9e4de69f58e +(Worker pid=441) DEBUG 04-22 01:38:59 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_233a6472'), local_subscribe_addr='ipc:///tmp/9635c6e6-7573-4cb8-ba15-e9e4de69f58e', local_notify_addr='ipc:///tmp/d4903846-6b2f-4ebf-8b4a-4838bed61d2e', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=442) DEBUG 04-22 01:38:59 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/9635c6e6-7573-4cb8-ba15-e9e4de69f58e +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(Worker pid=441) INFO 04-22 01:38:59 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(Worker pid=441) DEBUG 04-22 01:38:59 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776821939.8407264, auto_measure=True +(Worker pid=441) DEBUG 04-22 01:38:59 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=442) DEBUG 04-22 01:38:59 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776821939.8653443, auto_measure=True +(Worker pid=442) DEBUG 04-22 01:38:59 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=441) DEBUG 04-22 01:38:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=441) DEBUG 04-22 01:38:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=441) DEBUG 04-22 01:38:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=442) DEBUG 04-22 01:38:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=442) DEBUG 04-22 01:38:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=442) DEBUG 04-22 01:38:59 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=441) DEBUG 04-22 01:38:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=441) DEBUG 04-22 01:38:59 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=441) DEBUG 04-22 01:39:00 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=442) DEBUG 04-22 01:39:00 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=442) DEBUG 04-22 01:39:00 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=441) DEBUG 04-22 01:39:00 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_TP0 pid=441) INFO 04-22 01:39:00 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... +(Worker_TP0 pid=441) DEBUG 04-22 01:39:00 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_TP0 pid=441) INFO 04-22 01:39:00 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(Worker_TP0 pid=441) INFO 04-22 01:39:00 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP0 pid=441) DEBUG 04-22 01:39:00 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=441) DEBUG 04-22 01:39:00 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=441) DEBUG 04-22 01:39:00 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=441) DEBUG 04-22 01:39:00 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP1 pid=442) DEBUG 04-22 01:39:00 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP1 pid=442) DEBUG 04-22 01:39:00 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP1 pid=442) DEBUG 04-22 01:39:00 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=442) DEBUG 04-22 01:39:00 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP1 pid=442) DEBUG 04-22 01:39:00 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00002-of-00004.safetensors']] +(Worker_TP0 pid=441) DEBUG 04-22 01:39:00 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors']] +(Worker_TP0 pid=441) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(Worker_TP1 pid=442) DEBUG 04-22 01:39:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=442) DEBUG 04-22 01:39:12 [compilation/decorators.py:528] Start compiling function +(EngineCore pid=242) DEBUG 04-22 01:39:13 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py 
+(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=eb5300d232 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/2a701ed6c9/rank_1_0/backbone +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP1 pid=442) DEBUG 
04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP1 pid=442) DEBUG 04-22 
01:39:16 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP1 
pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP1 pid=442) 
DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': 
False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] Vllm 
config hash: eb5300d232 +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP0 pid=441) DEBUG 
04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP0 pid=441) INFO 04-22 01:39:16 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/2a701ed6c9/rank_0_0/backbone for vLLM's torch.compile +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=eb5300d232 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/2a701ed6c9/rank_0_0/backbone +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP0 pid=441) DEBUG 
04-22 01:39:16 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 
1024, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': 
-1.0, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, 
+(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 
[compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, 
+(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP0 pid=441) DEBUG 
04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] Vllm config hash: eb5300d232 +(Worker_TP0 pid=441) INFO 04-22 01:39:16 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.20 s +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 8192 +(Worker_TP0 pid=441) INFO 04-22 01:39:16 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm +(Worker_TP0 pid=441) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. 
+(Worker_TP0 pid=441) return func(*args, **kwargs) +(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=8192, hidden_dim=3584, dtype=torch.bfloat16 +(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=8192, hidden_dim=3584, dtype=torch.bfloat16 +(Worker_TP0 pid=441) INFO 04-22 01:39:16 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm +(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(Worker_TP1 pid=442) DEBUG 04-22 01:39:17 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=442) DEBUG 04-22 01:39:17 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=442) DEBUG 04-22 01:39:17 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP1 pid=442) DEBUG 04-22 01:39:17 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.0 ms +(Worker_TP1 pid=442) DEBUG 04-22 01:39:17 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=442) DEBUG 04-22 01:39:17 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes 
+(Worker_TP1 pid=442) DEBUG 04-22 01:39:17 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.5 ms +(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(APIServer pid=1) DEBUG 04-22 01:39:18 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=441) INFO 04-22 01:39:20 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(Worker_TP0 pid=441) DEBUG 04-22 01:39:20 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/2a701ed6c9/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(Worker_TP1 pid=442) DEBUG 04-22 01:39:20 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=442) DEBUG 04-22 01:39:20 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=442) DEBUG 04-22 01:39:20 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=442) DEBUG 04-22 01:39:20 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 66.5 ms +(Worker_TP1 pid=442) DEBUG 04-22 01:39:20 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=442) 
DEBUG 04-22 01:39:20 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=442) DEBUG 04-22 01:39:20 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=441) DEBUG 04-22 01:39:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=441) DEBUG 04-22 01:39:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=441) DEBUG 04-22 01:39:21 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=441) DEBUG 04-22 01:39:21 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 63.9 ms +(Worker_TP0 pid=441) DEBUG 04-22 01:39:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=441) DEBUG 04-22 01:39:21 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=441) DEBUG 04-22 01:39:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=441) DEBUG 04-22 01:39:21 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/2a701ed6c9/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(Worker_TP1 pid=442) DEBUG 04-22 01:39:22 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=442) DEBUG 04-22 01:39:22 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=442) DEBUG 04-22 01:39:22 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=442) DEBUG 04-22 01:39:22 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 63.3 ms +(Worker_TP1 pid=442) 
DEBUG 04-22 01:39:22 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP1 pid=442) DEBUG 04-22 01:39:22 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP1 pid=442) DEBUG 04-22 01:39:22 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=441) DEBUG 04-22 01:39:22 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=441) DEBUG 04-22 01:39:22 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=441) DEBUG 04-22 01:39:23 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=441) DEBUG 04-22 01:39:23 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 479.4 ms +(Worker_TP0 pid=441) DEBUG 04-22 01:39:23 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP0 pid=441) DEBUG 04-22 01:39:23 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP0 pid=441) DEBUG 04-22 01:39:23 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=441) DEBUG 04-22 01:39:23 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/2a701ed6c9/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') +(Worker_TP0 pid=441) INFO 04-22 01:39:23 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.23 s +(Worker_TP0 pid=441) DEBUG 04-22 01:39:23 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/2a701ed6c9/rank_0_0/backbone/computation_graph.py +(Worker_TP0 pid=441) INFO 04-22 01:39:24 [compilation/decorators.py:640] saved 
AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/aef92f39fb2a63603cb9c85df96429df4227a73756dec1d4158c61e06d29d35f/rank_0_0/model +(Worker_TP0 pid=441) INFO 04-22 01:39:24 [compilation/monitor.py:48] torch.compile took 11.64 s in total +(Worker_TP0 pid=441) INFO 04-22 01:39:24 [compilation/monitor.py:76] Initial profiling/warmup run took 0.18 s +(APIServer pid=1) DEBUG 04-22 01:39:28 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP1 pid=442) INFO 04-22 01:39:30 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP1 pid=442) INFO 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP0 pid=441) INFO 04-22 01:39:30 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP0 pid=441) INFO 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 122.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 122.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=442) DEBUG 04-22 
01:39:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 116.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP0 pid=441) INFO 04-22 01:39:30 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 116.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP1 pid=442) INFO 04-22 01:39:30 [distributed/device_communicators/custom_all_reduce.py:216] 
Registering 0 cuda graph addresses +(Worker_TP0 pid=441) DEBUG 04-22 01:39:31 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP0 pid=441) INFO 04-22 01:39:31 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.61 GiB total +(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP1 pid=442) INFO 04-22 01:39:31 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.61 GiB total +(Worker_TP0 pid=441) DEBUG 04-22 01:39:31 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP0 pid=441) DEBUG 04-22 01:39:31 [v1/worker/gpu_worker.py:430] Free memory after profiling: 68.1 GiB (total), 65.68 GiB (within requested) +(Worker_TP0 pid=441) DEBUG 04-22 01:39:31 [v1/worker/gpu_worker.py:435] Memory profiling takes 18.94 seconds. Total non KV cache memory: 11.39GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 2.06GiB; weights memory: 7.12GiB. +(Worker_TP0 pid=441) INFO 04-22 01:39:31 [v1/worker/gpu_worker.py:436] Available KV cache memory: 63.84 GiB +(Worker_TP0 pid=441) INFO 04-22 01:39:31 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9577 to maintain the same effective KV cache size. 
+(Worker_TP0 pid=441) DEBUG 04-22 01:39:31 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=242) DEBUG 04-22 01:39:31 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=242) DEBUG 04-22 01:39:31 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [v1/worker/gpu_worker.py:430] Free memory after profiling: 68.1 GiB (total), 65.68 GiB (within requested) +(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [v1/worker/gpu_worker.py:435] Memory profiling takes 18.93 seconds. Total non KV cache memory: 11.39GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 2.06GiB; weights memory: 7.12GiB. +(Worker_TP1 pid=442) INFO 04-22 01:39:31 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9577 to maintain the same effective KV cache size. 
+(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=242) DEBUG 04-22 01:39:31 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=242) INFO 04-22 01:39:31 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 2,390,592 tokens +(EngineCore pid=242) INFO 04-22 01:39:31 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 291.82x +(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=441) DEBUG 04-22 01:39:31 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=442) 2026-04-22 01:39:31,598 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP0 pid=441) 2026-04-22 01:39:31,599 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=441) DEBUG 04-22 01:39:31 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=442) 2026-04-22 01:39:31,612 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP0 pid=441) 2026-04-22 01:39:31,613 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=441) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=242) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=242) DEBUG 04-22 01:39:36 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. 
+(EngineCore pid=242) INFO 04-22 01:39:36 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP0 pid=441) DEBUG 04-22 01:39:36 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=442) DEBUG 04-22 01:39:36 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=242) DEBUG 04-22 01:39:36 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=242) DEBUG 04-22 01:39:36 [v1/engine/core.py:1158] EngineCore waiting for work. +(EngineCore pid=242) DEBUG 04-22 01:39:36 [v1/engine/core.py:1158] EngineCore waiting for work. +(APIServer pid=1) INFO 04-22 01:39:36 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] +(APIServer pid=1) WARNING 04-22 01:39:37 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'repetition_penalty': 1.05, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. +(APIServer pid=1) DEBUG 04-22 01:39:37 [renderers/base.py:197] Warming up chat template processing... +(APIServer pid=1) DEBUG 04-22 01:39:37 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e826d9-2c80016f419c8b023123a696;b016afaf-c2fd-411a-aa52-9e6efd66e7ec) +(APIServer pid=1) DEBUG 04-22 01:39:37 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 01:39:37 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/resolve/main/processor_config.json. +(APIServer pid=1) DEBUG 04-22 01:39:37 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. 
(Request ID: Root=1-69e826d9-30b1e4fc4eae13e100557232;38380ba9-6fd5-4d55-b45f-28872cddb86b) +(APIServer pid=1) DEBUG 04-22 01:39:37 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 01:39:37 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/resolve/main/preprocessor_config.json. +(Worker_TP1 pid=442) DEBUG 04-22 01:39:37 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=441) DEBUG 04-22 01:39:37 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) INFO 04-22 01:39:38 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. +(APIServer pid=1) DEBUG 04-22 01:39:38 [renderers/base.py:203] Chat template warmup completed in 0.762s +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:37] Available routes are: +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /load, Methods: GET +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /version, Methods: GET +(APIServer pid=1) INFO 04-22 01:39:38 
[entrypoints/launcher.py:46] Route: /health, Methods: GET +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST +(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/completions/render, 
Methods: POST +(APIServer pid=1) INFO: Started server process [1] +(APIServer pid=1) INFO: Waiting for application startup. +(APIServer pid=1) INFO: Application startup complete. +(APIServer pid=1) DEBUG 04-22 01:39:43 [v1/engine/async_llm.py:875] Called check_health. +(APIServer pid=1) INFO: 10.129.8.2:39450 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp4pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp4pp1dp1--8192.log new file mode 100644 index 00000000..884fc6de --- /dev/null +++ b/accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp4pp1dp1--8192.log @@ -0,0 +1,2766 @@ +DEBUG 04-22 01:39:53 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:39:53 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:39:53 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:39:53 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:39:53 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:39:53 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:39:53 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:39:53 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:39:53 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:39:53 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:39:53 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:39:53 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:39:58 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 01:40:00 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 01:40:00 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:40:00 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:40:00 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:40:00 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 01:40:00 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:40:00 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:40:00 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:40:00 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct +(APIServer pid=1) INFO 04-22 01:40:00 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:40:00 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:40:00 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'max_model_len': 8192, 'tensor_parallel_size': 4, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:40:00 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:40:00 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:40:00 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0004032 secs +(APIServer pid=1) INFO 04-22 01:40:00 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM +(APIServer pid=1) INFO 04-22 01:40:00 [config/model.py:1678] 
Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 01:40:00 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:40:00 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:40:00 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:40:00 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:40:00 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:40:00 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 01:40:00 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:40:00 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) INFO 04-22 01:40:02 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 01:40:02 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:40:02 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:40:02 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:40:02 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:40:06 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
+DEBUG 04-22 01:40:06 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:40:06 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:40:06 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:40:06 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:40:06 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:40:06 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:40:06 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:40:06 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:40:06 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:40:06 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:40:06 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:40:11 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=245) DEBUG 04-22 01:40:12 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:40:12 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=245) DEBUG 04-22 01:40:12 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/ec127100-6ce5-4395-a73b-d83623525879'], outputs=['ipc:///tmp/86065b28-fa27-42f9-9a3a-eec26e577961'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=245) DEBUG 04-22 01:40:12 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=245) DEBUG 04-22 01:40:12 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=245) DEBUG 04-22 01:40:12 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=245) DEBUG 04-22 01:40:12 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=245) DEBUG 04-22 01:40:12 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=245) INFO 04-22 01:40:12 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [292, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=245) WARNING 04-22 01:40:12 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=245) INFO 04-22 01:40:12 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.3.124 (local), world_size=4, local_world_size=4 +(EngineCore pid=245) DEBUG 04-22 01:40:12 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/cb21d1be-33a2-4133-82b9-d034295f0bd6 +(EngineCore pid=245) DEBUG 04-22 01:40:12 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_c6d422d2'), local_subscribe_addr='ipc:///tmp/cb21d1be-33a2-4133-82b9-d034295f0bd6', local_notify_addr='ipc:///tmp/c67727b1-0e20-4e5c-8290-cb9d665888f0', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 01:40:16 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:40:16 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:40:16 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:40:16 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:40:16 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:40:16 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:40:16 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:40:16 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:40:16 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-22 01:40:16 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:40:16 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:40:16 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:40:16 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:40:16 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:40:16 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:165] Checking if CPU platform is available. 
+DEBUG 04-22 01:40:16 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:40:16 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:40:16 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:40:16 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:40:16 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:40:16 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:40:21 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:40:21 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:40:21 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 01:40:21 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:40:22 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:40:22 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:40:22 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:40:22 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) DEBUG 04-22 01:40:22 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +DEBUG 04-22 01:40:22 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:40:22 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:40:22 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:40:22 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 01:40:22 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:40:22 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:40:22 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:40:22 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+DEBUG 04-22 01:40:22 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:40:22 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:40:22 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:40:22 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(Worker pid=444) DEBUG 04-22 01:40:23 [distributed/parallel_state.py:1356] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:50769 backend=nccl +(Worker pid=444) INFO 04-22 01:40:23 [distributed/parallel_state.py:1400] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:50769 backend=nccl +(Worker pid=447) DEBUG 04-22 01:40:23 [distributed/parallel_state.py:1356] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:50769 backend=nccl +(Worker pid=447) INFO 04-22 01:40:23 [distributed/parallel_state.py:1400] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:50769 backend=nccl +(Worker pid=446) DEBUG 04-22 01:40:24 [distributed/parallel_state.py:1356] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:50769 backend=nccl +(Worker pid=446) INFO 04-22 01:40:24 [distributed/parallel_state.py:1400] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:50769 backend=nccl +(Worker pid=445) DEBUG 04-22 01:40:24 [distributed/parallel_state.py:1356] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:50769 backend=nccl +(Worker pid=445) INFO 04-22 01:40:24 [distributed/parallel_state.py:1400] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:50769 backend=nccl +[Gloo] Rank 2 is connected to 3 peer ranks. 
Expected number of connected peer ranks is : 3 +[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=444) DEBUG 04-22 01:40:24 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=447) DEBUG 04-22 01:40:24 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=446) DEBUG 04-22 01:40:24 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=445) DEBUG 04-22 01:40:24 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=447) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=447) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=446) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=446) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=444) DEBUG 04-22 01:40:24 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=444) INFO 04-22 01:40:24 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=445) DEBUG 04-22 01:40:25 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=446) DEBUG 04-22 01:40:25 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=444) DEBUG 04-22 01:40:25 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=447) DEBUG 04-22 01:40:25 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=444) DEBUG 04-22 01:40:25 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/567daeca-6b5c-4b71-b54f-b51254ee4b10 +(Worker pid=444) DEBUG 04-22 01:40:25 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3], buffer_handle=(3, 4194304, 6, 'psm_14064846'), local_subscribe_addr='ipc:///tmp/567daeca-6b5c-4b71-b54f-b51254ee4b10', local_notify_addr='ipc:///tmp/008725ac-f2cf-41d2-a95c-2f8274b2d1fd', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=446) DEBUG 04-22 01:40:25 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/567daeca-6b5c-4b71-b54f-b51254ee4b10 +(Worker pid=445) DEBUG 04-22 01:40:25 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/567daeca-6b5c-4b71-b54f-b51254ee4b10 +(Worker pid=447) DEBUG 04-22 01:40:25 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/567daeca-6b5c-4b71-b54f-b51254ee4b10 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(Worker pid=444) INFO 04-22 01:40:25 [distributed/parallel_state.py:1716] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(Worker pid=444) DEBUG 04-22 01:40:25 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776822025.549855, auto_measure=True +(Worker pid=444) DEBUG 04-22 01:40:25 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=447) DEBUG 04-22 01:40:25 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776822025.587866, auto_measure=True +(Worker pid=447) DEBUG 04-22 01:40:25 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=446) DEBUG 04-22 01:40:25 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776822025.6051173, auto_measure=True +(Worker pid=446) DEBUG 04-22 01:40:25 [v1/worker/gpu_worker.py:285] worker 
requested memory: 75.23GiB +(Worker pid=445) DEBUG 04-22 01:40:25 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776822025.6054552, auto_measure=True +(Worker pid=445) DEBUG 04-22 01:40:25 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=444) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=444) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=447) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=447) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=444) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=447) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:40:25 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(Worker pid=446) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=445) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=446) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=445) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=444) DEBUG 04-22 01:40:25 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=446) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=445) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=447) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=447) DEBUG 04-22 01:40:25 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). 
+(Worker pid=446) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=445) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=444) DEBUG 04-22 01:40:25 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_TP0 pid=444) INFO 04-22 01:40:25 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... +(Worker pid=445) DEBUG 04-22 01:40:25 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=446) DEBUG 04-22 01:40:25 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker_TP0 pid=444) DEBUG 04-22 01:40:26 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_TP0 pid=444) INFO 04-22 01:40:26 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(Worker_TP0 pid=444) INFO 04-22 01:40:26 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP3 pid=447) DEBUG 04-22 01:40:26 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP3 pid=447) DEBUG 04-22 01:40:26 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP3 pid=447) DEBUG 04-22 01:40:26 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP3 pid=447) DEBUG 04-22 01:40:26 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP0 pid=444) DEBUG 04-22 01:40:26 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=444) DEBUG 04-22 01:40:26 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=444) DEBUG 04-22 01:40:26 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=444) DEBUG 04-22 01:40:26 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP2 pid=446) DEBUG 04-22 01:40:26 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP2 pid=446) DEBUG 04-22 01:40:26 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP2 pid=446) DEBUG 04-22 01:40:26 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP2 pid=446) DEBUG 04-22 01:40:26 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP1 pid=445) DEBUG 04-22 01:40:26 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP1 pid=445) DEBUG 04-22 01:40:26 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP1 pid=445) DEBUG 04-22 01:40:26 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=445) DEBUG 04-22 01:40:26 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP2 pid=446) DEBUG 04-22 01:40:26 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors']] +(Worker_TP0 pid=444) DEBUG 04-22 01:40:26 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors']] +(Worker_TP3 pid=447) DEBUG 04-22 01:40:26 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00001-of-00004.safetensors']] +(Worker_TP0 pid=444) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(Worker_TP2 pid=446) DEBUG 04-22 01:40:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=447) DEBUG 04-22 01:40:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=446) DEBUG 04-22 01:40:30 [compilation/decorators.py:528] Start compiling function +(Worker_TP3 pid=447) DEBUG 04-22 01:40:30 [compilation/decorators.py:528] Start compiling function +(Worker_TP0 pid=444) DEBUG 04-22 01:40:30 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:40:30 [compilation/decorators.py:528] Start compiling function +(EngineCore pid=245) DEBUG 04-22 01:40:30 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) DEBUG 04-22 01:40:32 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=40173243b0 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/926abca848/rank_1_0/backbone +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 
{'CMAKE_BUILD_TYPE': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 
'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 
'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 
'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 
'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 
'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 
'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 
[compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, 
+(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP1 
pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 
[compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] Vllm config hash: 40173243b0 +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP2 pid=446) DEBUG 04-22 
01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP0 pid=444) INFO 04-22 01:40:34 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone for vLLM's torch.compile +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 
cfg=40173243b0 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=40173243b0 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/926abca848/rank_2_0/backbone +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, 
+(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, 
+(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP0 
pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, 
+(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 
[compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] Vllm config hash: 40173243b0 +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VERBOSE': False, 
+(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, 
+(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP2 
pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP2 pid=446) 
DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP2 pid=446) DEBUG 04-22 
01:40:34 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP2 pid=446) 
DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 
256, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, 
+(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] Vllm config hash: 40173243b0 +(Worker_TP0 pid=444) INFO 04-22 01:40:34 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.91 s +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 2 MB,Maximal number of tokens used by Flashinfer Allreduce 
Fusion: 292 +(Worker_TP0 pid=444) INFO 04-22 01:40:34 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=40173243b0 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/926abca848/rank_3_0/backbone +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 
[compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
+(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP3 pid=447) DEBUG 04-22 
01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP3 pid=447) 
DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP3 pid=447) DEBUG 04-22 
01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP3 
pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP3 
pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] Vllm config hash: 40173243b0 +(Worker_TP0 pid=444) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. 
+(Worker_TP0 pid=444) return func(*args, **kwargs) +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=0, max_token_num=292, hidden_dim=3584, dtype=torch.bfloat16 +(Worker_TP1 pid=445) DEBUG 04-22 01:40:34 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=1, max_token_num=292, hidden_dim=3584, dtype=torch.bfloat16 +(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=2, max_token_num=292, hidden_dim=3584, dtype=torch.bfloat16 +(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=3, max_token_num=292, hidden_dim=3584, dtype=torch.bfloat16 +(Worker_TP0 pid=444) INFO 04-22 01:40:34 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 292), (293, 8192)] +(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(Worker_TP2 pid=446) DEBUG 04-22 01:40:35 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=446) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:35 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms 
+(Worker_TP0 pid=444) DEBUG 04-22 01:40:35 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:35 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=447) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:35 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP2 pid=446) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 34.0 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:35 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP2 pid=446) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:35 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP1 pid=445) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 32.9 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:35 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP1 pid=445) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:35 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP0 pid=444) DEBUG 04-22 01:40:35 
[compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 33.8 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:35 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP0 pid=444) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:35 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP3 pid=447) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 33.9 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:35 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP3 pid=447) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=444) INFO 04-22 01:40:37 [compilation/backends.py:372] Cache the graph of compile range (1, 292) for later use +(Worker_TP0 pid=444) DEBUG 04-22 01:40:37 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 292) from inductor_standalone via handle ('artifact_compile_range_1_292_subgraph_0', '/data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone/artifact_compile_range_1_292_subgraph_0') +(Worker_TP1 pid=445) DEBUG 04-22 01:40:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:38 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) +(Worker_TP1 pid=445) DEBUG 04-22 01:40:38 
[compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=445) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=446) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:38 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) +(Worker_TP2 pid=446) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=446) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:38 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) +(Worker_TP0 pid=444) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=444) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:38 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=447) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:38 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) +(Worker_TP3 pid=447) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=447) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=444) INFO 04-22 01:40:39 [compilation/backends.py:372] Cache the graph of compile range (293, 8192) for later use +(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/backends.py:377] Store the 0-th graph for compile range(293, 8192) from inductor_standalone via handle ('artifact_compile_range_293_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone/artifact_compile_range_293_8192_subgraph_0') +(Worker_TP1 pid=445) DEBUG 04-22 01:40:39 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:39 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=446) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP1 pid=445) DEBUG 04-22 
01:40:39 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=445) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 60.6 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=445) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:39 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP2 pid=446) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 59.6 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP2 pid=446) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 60.2 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:39 [compilation/.../utility/noop_elimination.py:105] 
Removed 0 no-op reshapes and slices +(Worker_TP3 pid=447) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:39 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP3 pid=447) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 71.6 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP3 pid=447) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.6 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 292) from inductor_standalone via handle ('artifact_compile_range_1_292_subgraph_1', '/data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone/artifact_compile_range_1_292_subgraph_1') +(Worker_TP1 pid=445) DEBUG 04-22 01:40:40 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:40 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) +(Worker_TP1 pid=445) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:40 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=445) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:40 [compilation/.../utility/noop_elimination.py:105] 
Removed 0 no-op reshapes and slices +(Worker_TP2 pid=446) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:40 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) +(Worker_TP2 pid=446) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:40 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=446) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:40 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:40 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) +(Worker_TP0 pid=444) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:40 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=444) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:40 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=447) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:40 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) +(Worker_TP3 pid=447) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP3 pid=447) 
DEBUG 04-22 01:40:40 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=447) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:42 [compilation/backends.py:377] Store the 1-th graph for compile range(293, 8192) from inductor_standalone via handle ('artifact_compile_range_293_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone/artifact_compile_range_293_8192_subgraph_1') +(APIServer pid=1) DEBUG 04-22 01:40:42 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP2 pid=446) DEBUG 04-22 01:40:43 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=446) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:43 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:43 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP2 pid=446) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 60.5 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:43 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP2 pid=446) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:43 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices 
+(Worker_TP0 pid=444) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:43 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=445) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 60.6 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:43 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP1 pid=445) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:43 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=444) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 61.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:43 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP0 pid=444) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 292) from inductor_standalone via handle ('artifact_compile_range_1_292_subgraph_28', '/data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone/artifact_compile_range_1_292_subgraph_28') +(Worker_TP0 pid=444) INFO 04-22 01:40:44 [compilation/backends.py:390] Compiling a graph for compile range (1, 292) takes 6.28 s +(Worker_TP2 pid=446) DEBUG 04-22 01:40:44 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices 
+(Worker_TP2 pid=446) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:44 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) +(Worker_TP2 pid=446) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP2 pid=446) DEBUG 04-22 01:40:44 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=446) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:44 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:44 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) +(Worker_TP1 pid=445) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms +(Worker_TP1 pid=445) DEBUG 04-22 01:40:44 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=445) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) +(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 
[compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 [compilation/backends.py:377] Store the 28-th graph for compile range(293, 8192) from inductor_standalone via handle ('artifact_compile_range_293_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone/artifact_compile_range_293_8192_subgraph_28') +(Worker_TP0 pid=444) INFO 04-22 01:40:44 [compilation/backends.py:390] Compiling a graph for compile range (293, 8192) takes 6.91 s +(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 73.2 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone/computation_graph.py +(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] 
NoOpEliminationPass completed in 0.4 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) +(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=444) INFO 04-22 01:40:45 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/f81953de96e95b6cb6058e30a79e97ddbce2a651eea9102c7d30fa4c49e7c4f4/rank_0_0/model +(Worker_TP0 pid=444) INFO 04-22 01:40:45 [compilation/monitor.py:48] torch.compile took 15.26 s in total +(Worker_TP0 pid=444) INFO 04-22 01:40:46 [compilation/monitor.py:76] Initial profiling/warmup run took 1.13 s +(Worker_TP0 pid=444) INFO 04-22 01:40:52 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP0 pid=444) INFO 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP3 pid=447) INFO 04-22 01:40:52 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP3 pid=447) INFO 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: 
None +(Worker_TP2 pid=446) INFO 04-22 01:40:52 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP2 pid=446) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP2 pid=446) INFO 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 
01:40:52 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 104.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 104.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, 
has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 60.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP3 pid=447) INFO 04-22 01:40:52 [distributed/device_communicators/custom_all_reduce.py:216] Registering 228 cuda graph addresses +(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 60.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP0 pid=444) INFO 04-22 01:40:52 [distributed/device_communicators/custom_all_reduce.py:216] Registering 228 cuda graph addresses +(Worker_TP1 pid=445) INFO 04-22 01:40:52 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP1 pid=445) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP1 pid=445) INFO 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(APIServer pid=1) DEBUG 04-22 01:40:52 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP2 pid=446) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 104.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, 
has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 60.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP2 pid=446) INFO 04-22 01:40:53 [distributed/device_communicators/custom_all_reduce.py:216] Registering 228 cuda graph addresses +(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 104.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 60.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP1 pid=445) INFO 04-22 01:40:53 [distributed/device_communicators/custom_all_reduce.py:216] Registering 228 cuda graph addresses +(Worker_TP3 pid=447) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP3 pid=447) INFO 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.69 GiB total +(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP2 pid=446) INFO 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.69 GiB total +(Worker_TP0 pid=444) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP0 pid=444) INFO 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.69 GiB total +(Worker_TP1 pid=445) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP1 pid=445) INFO 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.69 GiB total +(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:430] Free memory after profiling: 71.44 GiB (total), 69.03 GiB (within requested) +(Worker_TP3 
pid=447) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:435] Memory profiling takes 23.92 seconds. Total non KV cache memory: 7.89GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 2.13GiB; weights memory: 3.55GiB. +(Worker_TP3 pid=447) INFO 04-22 01:40:54 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9587 to maintain the same effective KV cache size. +(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=444) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP0 pid=444) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:430] Free memory after profiling: 71.44 GiB (total), 69.03 GiB (within requested) +(Worker_TP0 pid=444) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:435] Memory profiling takes 24.14 seconds. Total non KV cache memory: 7.89GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 2.13GiB; weights memory: 3.55GiB. +(Worker_TP0 pid=444) INFO 04-22 01:40:54 [v1/worker/gpu_worker.py:436] Available KV cache memory: 67.34 GiB +(Worker_TP0 pid=444) INFO 04-22 01:40:54 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9587 to maintain the same effective KV cache size. 
+(Worker_TP0 pid=444) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=445) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP1 pid=445) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:430] Free memory after profiling: 71.44 GiB (total), 69.03 GiB (within requested) +(Worker_TP1 pid=445) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:435] Memory profiling takes 24.25 seconds. Total non KV cache memory: 7.89GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 2.13GiB; weights memory: 3.55GiB. +(Worker_TP1 pid=445) INFO 04-22 01:40:54 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9587 to maintain the same effective KV cache size. 
+(Worker_TP1 pid=445) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=446) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP2 pid=446) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:430] Free memory after profiling: 71.44 GiB (total), 69.03 GiB (within requested) +(Worker_TP2 pid=446) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:435] Memory profiling takes 24.19 seconds. Total non KV cache memory: 7.89GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 2.13GiB; weights memory: 3.55GiB. +(Worker_TP2 pid=446) INFO 04-22 01:40:54 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9587 to maintain the same effective KV cache size. 
+(Worker_TP2 pid=446) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) INFO 04-22 01:40:54 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 5,043,280 tokens +(EngineCore pid=245) INFO 04-22 01:40:54 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 615.63x +(Worker_TP2 pid=446) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=445) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=444) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=447) 2026-04-22 01:40:54,441 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP2 pid=446) 2026-04-22 01:40:54,441 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP1 pid=445) 2026-04-22 01:40:54,441 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP0 pid=444) 2026-04-22 01:40:54,441 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=446) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=447) 2026-04-22 01:40:54,457 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP0 pid=444) 2026-04-22 01:40:54,458 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP1 pid=445) 2026-04-22 01:40:54,458 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP2 pid=446) 2026-04-22 01:40:54,458 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=245) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=245) INFO 04-22 01:40:59 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP0 pid=444) DEBUG 04-22 01:40:59 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=446) DEBUG 04-22 01:40:59 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=445) DEBUG 04-22 01:40:59 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=447) DEBUG 04-22 01:40:59 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 01:40:59 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 01:40:59 [v1/engine/core.py:1158] EngineCore waiting for work. +(EngineCore pid=245) DEBUG 04-22 01:40:59 [v1/engine/core.py:1158] EngineCore waiting for work. +(APIServer pid=1) INFO 04-22 01:40:59 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] +(APIServer pid=1) WARNING 04-22 01:41:00 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'repetition_penalty': 1.05, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. 
+(APIServer pid=1) DEBUG 04-22 01:41:00 [renderers/base.py:197] Warming up chat template processing... +(APIServer pid=1) DEBUG 04-22 01:41:00 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e8272c-2c488e1d2cc021b830dd2d98;f63a5c98-c381-4f93-974b-8d392aa77a95) +(APIServer pid=1) DEBUG 04-22 01:41:00 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 01:41:00 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/resolve/main/processor_config.json. +(APIServer pid=1) DEBUG 04-22 01:41:00 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e8272c-779a7a1a4cc3e6661e65eebc;e6a5907f-a49c-4e52-9bc2-7db82e7d73db) +(APIServer pid=1) DEBUG 04-22 01:41:00 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 01:41:00 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/resolve/main/preprocessor_config.json. +(Worker_TP1 pid=445) DEBUG 04-22 01:41:00 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=444) DEBUG 04-22 01:41:00 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=446) DEBUG 04-22 01:41:00 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=447) DEBUG 04-22 01:41:00 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) INFO 04-22 01:41:01 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. 
+(APIServer pid=1) DEBUG 04-22 01:41:01 [renderers/base.py:203] Chat template warmup completed in 0.733s +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:37] Available routes are: +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /load, Methods: GET +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /version, Methods: GET +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /health, Methods: GET +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST +(APIServer pid=1) INFO 04-22 01:41:01 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST +(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST +(APIServer pid=1) INFO: Started server process [1] +(APIServer pid=1) INFO: Waiting for application startup. +(APIServer pid=1) INFO: Application startup complete. +(APIServer pid=1) DEBUG 04-22 01:41:09 [v1/engine/async_llm.py:875] Called check_health. +(APIServer pid=1) INFO: 10.131.2.2:40078 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/qwen-qwen3-14b--h100-80gb--tp5pp1dp1--8192.FAILED.log b/accuracy/results/v0.19.0/logs/qwen-qwen3-14b--h100-80gb--tp5pp1dp1--8192.FAILED.log new file mode 100644 index 00000000..8c18899e --- /dev/null +++ b/accuracy/results/v0.19.0/logs/qwen-qwen3-14b--h100-80gb--tp5pp1dp1--8192.FAILED.log @@ -0,0 +1,675 @@ +DEBUG 04-22 00:27:10 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
+DEBUG 04-22 00:27:10 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:27:10 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:27:10 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:27:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:27:10 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:27:10 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:27:10 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:27:10 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:27:10 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:27:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:27:10 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:27:15 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:27:17 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 00:27:17 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:27:17 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:27:17 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:27:17 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:27:17 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:27:17 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:27:17 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:27:17 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen3-14B +(APIServer pid=1) INFO 04-22 00:27:17 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:27:17 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:27:17 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen3-14B', 'model': 'Qwen/Qwen3-14B', 'max_model_len': 8192, 'tensor_parallel_size': 5, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:27:17 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:27:17 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen3.Qwen3ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:27:17 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0010808 secs +(APIServer pid=1) INFO 04-22 00:27:17 [config/model.py:549] Resolved architecture: Qwen3ForCausalLM +(APIServer pid=1) INFO 04-22 00:27:17 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:27:17 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:27:17 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:27:17 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:27:17 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:27:17 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 00:27:17 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 00:27:17 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:27:17 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) INFO 04-22 00:27:17 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 00:27:17 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:27:17 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:27:18 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:27:18 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:27:21 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:27:21 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:27:21 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:27:21 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:27:21 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:27:21 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:27:21 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:27:21 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:27:21 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:27:21 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:27:21 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
+DEBUG 04-22 00:27:21 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:27:26 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=243) DEBUG 04-22 00:27:28 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:27:28 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=243) DEBUG 04-22 00:27:28 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/1bc2d50e-7e9a-4172-a205-50e79bb5d230'], outputs=['ipc:///tmp/442333a8-48d7-49cf-8792-67cac68c97ad'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 00:27:28 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 00:27:28 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 00:27:28 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 00:27:28 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 00:27:28 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=243) INFO 04-22 00:27:28 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen3-14B', speculative_config=None, tokenizer='Qwen/Qwen3-14B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=5, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-14B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) WARNING 04-22 00:27:28 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=243) INFO 04-22 00:27:28 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.128.4.195 (local), world_size=5, local_world_size=5 +(EngineCore pid=243) DEBUG 04-22 00:27:28 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/63114b8d-5acb-4b99-8fba-8ac83335cae3 +(EngineCore pid=243) DEBUG 04-22 00:27:28 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3, 4], buffer_handle=(5, 16777216, 10, 'psm_374da923'), local_subscribe_addr='ipc:///tmp/63114b8d-5acb-4b99-8fba-8ac83335cae3', local_notify_addr='ipc:///tmp/82d7443c-6675-4caa-abc2-1d5430bdfb80', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 00:27:31 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:27:31 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:27:31 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:27:31 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:27:31 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:27:31 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-22 00:27:31 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:27:31 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:27:31 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:27:31 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:27:31 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:27:31 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
+DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:27:31 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:27:31 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:27:31 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:27:31 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:27:31 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:27:36 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:27:36 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:27:37 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:27:37 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:27:37 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(APIServer pid=1) DEBUG 04-22 00:27:38 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +DEBUG 04-22 00:27:38 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:27:38 [plugins/__init__.py:49] All plugins in this group will be loaded. 
Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 00:27:38 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:27:38 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 00:27:38 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:27:38 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 00:27:38 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:27:38 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+DEBUG 04-22 00:27:38 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:27:38 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(Worker pid=442) DEBUG 04-22 00:27:40 [distributed/parallel_state.py:1356] world_size=5 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl +(Worker pid=442) INFO 04-22 00:27:40 [distributed/parallel_state.py:1400] world_size=5 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl +(Worker pid=446) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1356] world_size=5 rank=4 local_rank=4 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl +(Worker pid=446) INFO 04-22 00:27:41 [distributed/parallel_state.py:1400] world_size=5 rank=4 local_rank=4 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl +(Worker pid=444) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1356] world_size=5 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl +(Worker pid=444) INFO 04-22 00:27:41 [distributed/parallel_state.py:1400] world_size=5 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl +(Worker pid=445) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1356] world_size=5 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl +(Worker pid=445) INFO 04-22 00:27:41 [distributed/parallel_state.py:1400] world_size=5 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl +(Worker pid=443) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1356] world_size=5 rank=1 local_rank=1 
distributed_init_method=tcp://127.0.0.1:60649 backend=nccl +(Worker pid=443) INFO 04-22 00:27:41 [distributed/parallel_state.py:1400] world_size=5 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl +[Gloo] Rank 1 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 +[Gloo] Rank 0 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 +[Gloo] Rank 2 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 +[Gloo] Rank 4 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 +[Gloo] Rank 3 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 +(Worker pid=445) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=444) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=443) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=442) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=446) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 2 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 +[Gloo] Rank 1 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 +[Gloo] Rank 0 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 +[Gloo] Rank 3 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 +[Gloo] Rank 4 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 +(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
+(Worker pid=446) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=446) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=442) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=442) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=442) DEBUG 04-22 00:27:41 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=442) INFO 04-22 00:27:41 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=442) WARNING 04-22 00:27:42 [distributed/device_communicators/symm_mem.py:73] SymmMemCommunicator: World size 5 not supported, communicator is not available. 
+(Worker pid=445) WARNING 04-22 00:27:42 [distributed/device_communicators/symm_mem.py:73] SymmMemCommunicator: World size 5 not supported, communicator is not available. +(Worker pid=443) WARNING 04-22 00:27:42 [distributed/device_communicators/symm_mem.py:73] SymmMemCommunicator: World size 5 not supported, communicator is not available. +(Worker pid=446) WARNING 04-22 00:27:42 [distributed/device_communicators/symm_mem.py:73] SymmMemCommunicator: World size 5 not supported, communicator is not available. +(Worker pid=444) WARNING 04-22 00:27:42 [distributed/device_communicators/symm_mem.py:73] SymmMemCommunicator: World size 5 not supported, communicator is not available. +(Worker pid=444) WARNING 04-22 00:27:42 [distributed/device_communicators/custom_all_reduce.py:106] Custom allreduce is disabled due to an unsupported world size: 5. Supported world sizes: [2, 4, 6, 8]. To silence this warning, specify disable_custom_all_reduce=True explicitly. +(Worker pid=442) WARNING 04-22 00:27:42 [distributed/device_communicators/custom_all_reduce.py:106] Custom allreduce is disabled due to an unsupported world size: 5. Supported world sizes: [2, 4, 6, 8]. To silence this warning, specify disable_custom_all_reduce=True explicitly. +(Worker pid=446) WARNING 04-22 00:27:42 [distributed/device_communicators/custom_all_reduce.py:106] Custom allreduce is disabled due to an unsupported world size: 5. Supported world sizes: [2, 4, 6, 8]. To silence this warning, specify disable_custom_all_reduce=True explicitly. +(Worker pid=443) WARNING 04-22 00:27:42 [distributed/device_communicators/custom_all_reduce.py:106] Custom allreduce is disabled due to an unsupported world size: 5. Supported world sizes: [2, 4, 6, 8]. To silence this warning, specify disable_custom_all_reduce=True explicitly. +(Worker pid=445) WARNING 04-22 00:27:42 [distributed/device_communicators/custom_all_reduce.py:106] Custom allreduce is disabled due to an unsupported world size: 5. 
Supported world sizes: [2, 4, 6, 8]. To silence this warning, specify disable_custom_all_reduce=True explicitly. +(Worker pid=442) DEBUG 04-22 00:27:42 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/7138be5c-96a9-4137-95c0-b4d63d41bb97 +(Worker pid=442) DEBUG 04-22 00:27:42 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3, 4], buffer_handle=(4, 4194304, 6, 'psm_973a6fc4'), local_subscribe_addr='ipc:///tmp/7138be5c-96a9-4137-95c0-b4d63d41bb97', local_notify_addr='ipc:///tmp/835c2a1c-53fe-42e2-9c0c-85a4cc18cdd1', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=446) DEBUG 04-22 00:27:42 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/7138be5c-96a9-4137-95c0-b4d63d41bb97 +(Worker pid=444) DEBUG 04-22 00:27:42 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/7138be5c-96a9-4137-95c0-b4d63d41bb97 +(Worker pid=445) DEBUG 04-22 00:27:42 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/7138be5c-96a9-4137-95c0-b4d63d41bb97 +(Worker pid=443) DEBUG 04-22 00:27:42 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/7138be5c-96a9-4137-95c0-b4d63d41bb97 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(Worker pid=442) INFO 04-22 00:27:42 [distributed/parallel_state.py:1716] rank 0 in world size 5 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(Worker pid=444) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, timestamp=1776817663.1936586, auto_measure=True +(Worker pid=444) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=446) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, timestamp=1776817663.2081609, auto_measure=True +(Worker pid=446) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=442) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, timestamp=1776817663.2475388, auto_measure=True +(Worker pid=442) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=445) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, timestamp=1776817663.2508624, auto_measure=True +(Worker pid=445) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=444) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, 
timestamp=1776817663.290896, auto_measure=True +(Worker pid=443) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=444) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=444) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=446) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=446) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=446) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=442) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=442) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=446) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=444) DEBUG 04-22 00:27:43 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). 
+(Worker pid=445) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=445) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=442) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=446) DEBUG 04-22 00:27:43 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=445) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=442) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=443) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=445) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=442) DEBUG 04-22 00:27:43 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(Worker pid=443) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=442) DEBUG 04-22 00:27:43 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=445) DEBUG 04-22 00:27:43 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=443) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=443) DEBUG 04-22 00:27:43 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=442) DEBUG 04-22 00:27:43 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_TP0 pid=442) INFO 04-22 00:27:43 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen3-14B... +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. 
+(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 613, in __init__ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.worker.load_model() +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 323, in load_model +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model_runner.load_model(load_dummy_weights=load_dummy_weights) +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP4 pid=446) ERROR 
04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4751, in load_model +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = model_loader.load_model( +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 55, in load_model +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = initialize_model( +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 57, in initialize_model +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = model_class(vllm_config=vllm_config, prefix=prefix) +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 
[v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 289, in __init__ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = Qwen3Model( +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 256, in __init__ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] super().__init__( +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 391, in __init__ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.embed_tokens = VocabParallelEmbedding( +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 254, in __init__ 
+(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.shard_indices = self._get_indices( +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 329, in _get_indices +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size) +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 91, in vocab_range_from_global_vocab_size +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] per_partition_vocab_size = divide(global_vocab_size, world_size) +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 63, in divide +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ensure_divisibility(numerator, denominator) +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 55, in ensure_divisibility +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] assert numerator % denominator == 0, "{} is not divisible by {}".format( +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] AssertionError: 151936 is not divisible by 5 +(EngineCore 
pid=243) DEBUG 04-22 00:27:44 [v1/executor/multiproc_executor.py:419] Worker Termination: allow workers to gracefully shutdown +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 613, in __init__ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.worker.load_model() +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 323, in load_model +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model_runner.load_model(load_dummy_weights=load_dummy_weights) +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper 
+(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4751, in load_model +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = model_loader.load_model( +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 55, in load_model +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = initialize_model( +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File 
"/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 57, in initialize_model +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = model_class(vllm_config=vllm_config, prefix=prefix) +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 289, in __init__ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = Qwen3Model( +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 256, in __init__ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] super().__init__( +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 391, in __init__ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.embed_tokens = VocabParallelEmbedding( +(Worker_TP2 pid=444) 
ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 254, in __init__ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.shard_indices = self._get_indices( +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 329, in _get_indices +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size) +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 91, in vocab_range_from_global_vocab_size +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] per_partition_vocab_size = divide(global_vocab_size, world_size) +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 63, in divide +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ensure_divisibility(numerator, denominator) +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 55, in ensure_divisibility +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] assert numerator % denominator == 
0, "{} is not divisible by {}".format( +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] AssertionError: 151936 is not divisible by 5 +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 613, in __init__ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.worker.load_model() +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 323, in load_model +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model_runner.load_model(load_dummy_weights=load_dummy_weights) +(Worker_TP3 pid=445) ERROR 04-22 
00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4751, in load_model +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = model_loader.load_model( +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 55, in load_model +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = initialize_model( +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP3 
pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 57, in initialize_model +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = model_class(vllm_config=vllm_config, prefix=prefix) +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 289, in __init__ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = Qwen3Model( +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 256, in __init__ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] super().__init__( +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 391, in __init__ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 
[v1/executor/multiproc_executor.py:857] self.embed_tokens = VocabParallelEmbedding( +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 254, in __init__ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.shard_indices = self._get_indices( +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 329, in _get_indices +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size) +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 91, in vocab_range_from_global_vocab_size +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] per_partition_vocab_size = divide(global_vocab_size, world_size) +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 63, in divide +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ensure_divisibility(numerator, denominator) +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 55, in ensure_divisibility 
+(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] assert numerator % denominator == 0, "{} is not divisible by {}".format( +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] AssertionError: 151936 is not divisible by 5 +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 613, in __init__ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.worker.load_model() +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 323, in load_model +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 
[v1/executor/multiproc_executor.py:857] self.model_runner.load_model(load_dummy_weights=load_dummy_weights) +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4751, in load_model +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = model_loader.load_model( +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 55, in load_model +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = initialize_model( +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return 
func(*args, **kwargs) +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 57, in initialize_model +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = model_class(vllm_config=vllm_config, prefix=prefix) +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 289, in __init__ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = Qwen3Model( +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 256, in __init__ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] super().__init__( +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File 
"/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 391, in __init__ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.embed_tokens = VocabParallelEmbedding( +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 254, in __init__ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.shard_indices = self._get_indices( +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 329, in _get_indices +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size) +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 91, in vocab_range_from_global_vocab_size +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] per_partition_vocab_size = divide(global_vocab_size, world_size) +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 63, in divide +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ensure_divisibility(numerator, denominator) +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 
[v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 55, in ensure_divisibility +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] assert numerator % denominator == 0, "{} is not divisible by {}".format( +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] AssertionError: 151936 is not divisible by 5 +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 613, in __init__ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.worker.load_model() +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File 
"/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 323, in load_model +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model_runner.load_model(load_dummy_weights=load_dummy_weights) +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4751, in load_model +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = model_loader.load_model( +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 55, in load_model +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = initialize_model( +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File 
"/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 57, in initialize_model +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = model_class(vllm_config=vllm_config, prefix=prefix) +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 289, in __init__ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = Qwen3Model( +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 256, in __init__ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] super().__init__( +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 
[v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 391, in __init__ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.embed_tokens = VocabParallelEmbedding( +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 254, in __init__ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.shard_indices = self._get_indices( +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 329, in _get_indices +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size) +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 91, in vocab_range_from_global_vocab_size +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] per_partition_vocab_size = divide(global_vocab_size, world_size) +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 63, in divide +(Worker_TP0 pid=442) ERROR 
04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ensure_divisibility(numerator, denominator) +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 55, in ensure_divisibility +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] assert numerator % denominator == 0, "{} is not divisible by {}".format( +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] AssertionError: 151936 is not divisible by 5 +[rank0]:[W422 00:27:45.545461215 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] EngineCore failed to start. 
+(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] Traceback (most recent call last): +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] return func(*args, **kwargs) +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__ +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] super().__init__( +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 114, in __init__ +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] self.model_executor = executor_class(vllm_config) +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 101, in __init__ +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] super().__init__(vllm_config) +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(EngineCore pid=243) ERROR 04-22 00:27:46 
[v1/engine/core.py:1108] return func(*args, **kwargs) +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 103, in __init__ +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] self._init_executor() +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 190, in _init_executor +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] self.workers = WorkerProc.wait_for_ready(unready_workers) +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 736, in wait_for_ready +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] raise e from None +(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause. 
+(EngineCore pid=243) Process EngineCore: +(EngineCore pid=243) Traceback (most recent call last): +(EngineCore pid=243) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap +(EngineCore pid=243) self.run() +(EngineCore pid=243) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run +(EngineCore pid=243) self._target(*self._args, **self._kwargs) +(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1112, in run_engine_core +(EngineCore pid=243) raise e +(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core +(EngineCore pid=243) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) +(EngineCore pid=243) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(EngineCore pid=243) return func(*args, **kwargs) +(EngineCore pid=243) ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__ +(EngineCore pid=243) super().__init__( +(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 114, in __init__ +(EngineCore pid=243) self.model_executor = executor_class(vllm_config) +(EngineCore pid=243) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 101, in __init__ +(EngineCore pid=243) super().__init__(vllm_config) +(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(EngineCore pid=243) return func(*args, **kwargs) +(EngineCore pid=243) ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 103, in __init__ +(EngineCore pid=243) self._init_executor() 
+(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 190, in _init_executor +(EngineCore pid=243) self.workers = WorkerProc.wait_for_ready(unready_workers) +(EngineCore pid=243) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 736, in wait_for_ready +(EngineCore pid=243) raise e from None +(EngineCore pid=243) Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause. +(EngineCore pid=243) DEBUG 04-22 00:27:46 [v1/executor/multiproc_executor.py:438] Triggering shutdown of workers +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in +(APIServer pid=1) sys.exit(main()) +(APIServer pid=1) ^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main +(APIServer pid=1) args.dispatch_function(args) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd +(APIServer pid=1) uvloop.run(run_server(args)) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run +(APIServer pid=1) return __asyncio.run( +(APIServer pid=1) ^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run +(APIServer pid=1) return runner.run(main) +(APIServer pid=1) ^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run +(APIServer pid=1) return self._loop.run_until_complete(task) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper +(APIServer pid=1) return await main 
+(APIServer pid=1) ^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server +(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker +(APIServer pid=1) async with build_async_engine_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client +(APIServer pid=1) async with build_async_engine_client_from_engine_args( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args +(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config +(APIServer pid=1) return cls( +(APIServer pid=1) ^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 154, in __init__ +(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(APIServer pid=1) return func(*args, **kwargs) +(APIServer pid=1) 
^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client +(APIServer pid=1) return AsyncMPClient(*client_args) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(APIServer pid=1) return func(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 887, in __init__ +(APIServer pid=1) super().__init__( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 535, in __init__ +(APIServer pid=1) with launch_core_engines( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ +(APIServer pid=1) next(self.gen) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines +(APIServer pid=1) wait_for_engine_startup( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup +(APIServer pid=1) raise RuntimeError( +(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} +/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown + warnings.warn('resource_tracker: There appear to be %d ' diff --git a/accuracy/results/v0.19.0/logs/qwen-qwen3-30b-a3b--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/qwen-qwen3-30b-a3b--h100-80gb--tp1pp1dp1--8192.log new file mode 100644 index 00000000..98b73b84 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/qwen-qwen3-30b-a3b--h100-80gb--tp1pp1dp1--8192.log @@ -0,0 +1,774 @@ +DEBUG 04-22 01:06:00 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:06:00 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:06:00 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:06:00 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:06:00 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:06:00 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:06:00 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:06:00 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:06:00 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:06:00 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:06:00 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:06:00 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:06:04 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 01:06:06 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 01:06:06 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:06:06 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:06:06 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:06:06 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 01:06:06 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:06:06 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:06:06 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:06:06 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen3-30B-A3B +(APIServer pid=1) INFO 04-22 01:06:06 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:06:06 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:06:06 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen3-30B-A3B', 'model': 'Qwen/Qwen3-30B-A3B', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:06:06 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:06:07 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen3_moe.Qwen3MoeForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:06:07 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0016402 secs +(APIServer pid=1) INFO 04-22 01:06:07 [config/model.py:549] Resolved architecture: Qwen3MoeForCausalLM +(APIServer pid=1) INFO 04-22 01:06:07 [config/model.py:1678] Using max model len 8192 +(APIServer 
pid=1) DEBUG 04-22 01:06:07 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:06:07 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:06:07 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:06:07 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:06:07 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 01:06:07 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:06:07 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 01:06:08 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:06:08 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:06:09 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:06:09 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:06:13 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:06:13 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:06:13 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:06:13 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:06:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:06:13 [platforms/__init__.py:113] Checking if ROCm platform is available. 
+DEBUG 04-22 01:06:13 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:06:13 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:06:13 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:06:13 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:06:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:06:13 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:06:18 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=243) DEBUG 04-22 01:06:19 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:06:19 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=243) DEBUG 04-22 01:06:19 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/3e2c30c9-94dc-4d97-b759-8f269957db68'], outputs=['ipc:///tmp/c9416175-aef4-4b66-9761-7180c0c8b868'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 01:06:19 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 01:06:19 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 01:06:19 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 01:06:19 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 01:06:19 [plugins/__init__.py:49] All plugins in this group will be 
loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(EngineCore pid=243) INFO 04-22 01:06:19 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen3-30B-A3B', speculative_config=None, tokenizer='Qwen/Qwen3-30B-A3B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-30B-A3B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) DEBUG 04-22 01:06:20 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.86:46973 backend=nccl +(EngineCore pid=243) INFO 04-22 01:06:20 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.86:46973 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) DEBUG 04-22 01:06:20 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) INFO 04-22 01:06:20 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A +(EngineCore pid=243) DEBUG 04-22 01:06:20 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776819980.5765347, auto_measure=True +(EngineCore pid=243) DEBUG 04-22 01:06:20 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=243) DEBUG 04-22 01:06:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:06:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:06:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:06:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=243) DEBUG 04-22 01:06:20 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. 
Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(EngineCore pid=243) DEBUG 04-22 01:06:20 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=243) DEBUG 04-22 01:06:20 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=243) DEBUG 04-22 01:06:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) INFO 04-22 01:06:20 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen3-30B-A3B... +(EngineCore pid=243) DEBUG 04-22 01:06:21 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=243) INFO 04-22 01:06:21 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=243) INFO 04-22 01:06:21 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=243) INFO 04-22 01:06:21 [model_executor/.../oracle/unquantized.py:186] Using TRITON backend for Unquantized MoE +(EngineCore pid=243) DEBUG 04-22 01:06:21 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts +(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
+(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=243) DEBUG 04-22 01:06:21 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=243) DEBUG 04-22 01:06:21 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=243) DEBUG 04-22 01:06:21 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 193, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=243) DEBUG 04-22 01:06:21 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=243) DEBUG 04-22 01:06:21 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00013-of-00016.safetensors', 'model-00005-of-00016.safetensors', 'model-00004-of-00016.safetensors', 'model-00007-of-00016.safetensors', 'model-00006-of-00016.safetensors', 'model-00016-of-00016.safetensors', 'model-00009-of-00016.safetensors', 'model-00003-of-00016.safetensors', 'model-00002-of-00016.safetensors', 'model-00015-of-00016.safetensors', 'model-00010-of-00016.safetensors', 'model-00011-of-00016.safetensors', 'model-00001-of-00016.safetensors', 'model-00012-of-00016.safetensors', 'model-00008-of-00016.safetensors', 'model-00014-of-00016.safetensors']] +(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/16 [00:00 +(APIServer pid=1) DEBUG 04-22 01:07:19 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=243) INFO 04-22 01:07:24 [compilation/backends.py:1051] Using cache directory: 
/data/.cache/vllm/torch_compile_cache/2fc27d8a46/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=496d7d5de7 comp=e546579c48 code=691aa59361ebbac2850a48ed2c9a2c8014c83c0f1cd12fbe5f19c3ebb373a13e dir=/data/.cache/vllm/torch_compile_cache/2fc27d8a46/rank_0_0/backbone +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 
'auto', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, 
+(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=243) DEBUG 04-22 01:07:24 
[compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=243) DEBUG 04-22 01:07:24 
[compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 01:07:24 
[compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
+(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] Vllm config hash: 496d7d5de7 +(EngineCore pid=243) INFO 04-22 01:07:24 [compilation/backends.py:1111] Dynamo bytecode transform time: 6.86 s +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=243) DEBUG 04-22 01:07:26 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:07:26 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=243) DEBUG 04-22 01:07:26 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=243) DEBUG 04-22 01:07:26 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:07:26 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(APIServer pid=1) DEBUG 04-22 01:07:29 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) INFO 04-22 01:07:29 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=243) DEBUG 04-22 01:07:29 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/2fc27d8a46/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=243) DEBUG 04-22 01:07:30 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:07:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=243) DEBUG 04-22 01:07:30 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=243) DEBUG 04-22 01:07:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:07:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) DEBUG 04-22 01:07:31 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/2fc27d8a46/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=243) DEBUG 04-22 01:07:32 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:07:32 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(EngineCore pid=243) DEBUG 04-22 01:07:32 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms +(EngineCore pid=243) DEBUG 04-22 01:07:32 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:07:32 
[compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 01:07:32 [compilation/backends.py:377] Store the 48-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_48', '/data/.cache/vllm/torch_compile_cache/2fc27d8a46/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_48') +(EngineCore pid=243) INFO 04-22 01:07:32 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 7.91 s +(EngineCore pid=243) DEBUG 04-22 01:07:33 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/2fc27d8a46/rank_0_0/backbone/computation_graph.py +(EngineCore pid=243) INFO 04-22 01:07:34 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/aa66eecb906a0de896d6e1e95b50437f56fffa80eec8e3863e8ea6c4af7925d3/rank_0_0/model +(EngineCore pid=243) INFO 04-22 01:07:34 [compilation/monitor.py:48] torch.compile took 17.11 s in total +(EngineCore pid=243) WARNING 04-22 01:07:35 [model_executor/.../fused_moe/fused_moe.py:1090] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json +(EngineCore pid=243) INFO 04-22 01:07:35 [compilation/monitor.py:76] Initial profiling/warmup run took 1.08 s +(APIServer pid=1) DEBUG 04-22 01:07:39 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) INFO 04-22 01:07:41 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=243) DEBUG 04-22 01:07:41 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=243) INFO 04-22 01:07:41 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=243) DEBUG 04-22 01:07:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:07:42 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:07:42 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 108.00 MiB first-capture + (51-1) × 10.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:07:42 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
+(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:07:42 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 266.00 MiB first-capture + (51-1) × 10.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 01:07:43 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=243) INFO 04-22 01:07:43 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.24 GiB total +(EngineCore pid=243) DEBUG 04-22 01:07:43 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=243) DEBUG 04-22 01:07:43 [v1/worker/gpu_worker.py:430] Free memory after profiling: 20.74 GiB (total), 17.29 GiB (within requested) +(EngineCore pid=243) DEBUG 04-22 01:07:43 [v1/worker/gpu_worker.py:435] Memory profiling takes 25.66 seconds. Total non KV cache memory: 59.82GiB; torch peak memory increase: 2.68GiB; non-torch forward increase memory: 0.27GiB; weights memory: 56.88GiB. +(EngineCore pid=243) INFO 04-22 01:07:43 [v1/worker/gpu_worker.py:436] Available KV cache memory: 15.41 GiB +(EngineCore pid=243) INFO 04-22 01:07:43 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9656 to maintain the same effective KV cache size. 
+(EngineCore pid=243) INFO 04-22 01:07:43 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 168,288 tokens +(EngineCore pid=243) INFO 04-22 01:07:43 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 20.54x +(EngineCore pid=243) 2026-04-22 01:07:43,491 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=243) DEBUG 04-22 01:07:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) 2026-04-22 01:07:43,521 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:04:39 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:04:39 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:04:39 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:04:39 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:04:39 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:04:39 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen3-8B +(APIServer pid=1) INFO 04-22 01:04:39 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:04:39 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:04:39 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen3-8B', 'model': 'Qwen/Qwen3-8B', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:04:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:04:39 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen3.Qwen3ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:04:39 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0006517 secs +(APIServer pid=1) INFO 04-22 01:04:39 [config/model.py:549] Resolved architecture: Qwen3ForCausalLM +(APIServer pid=1) INFO 04-22 01:04:39 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 01:04:39 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:04:39 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:04:39 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:04:39 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:04:39 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) INFO 04-22 01:04:39 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:04:39 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 01:04:39 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:04:39 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:04:40 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:04:40 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:04:44 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:04:44 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:04:44 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:04:44 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:04:44 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:04:44 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:04:44 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:04:44 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:04:44 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:04:44 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:04:44 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:04:44 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:04:49 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+(APIServer pid=1) DEBUG 04-22 01:04:50 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. +(EngineCore pid=243) DEBUG 04-22 01:04:50 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:04:50 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=243) DEBUG 04-22 01:04:50 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/db739ab4-58eb-4aeb-b654-24c2702f39ba'], outputs=['ipc:///tmp/c68fa860-9ede-4935-81bc-ed8c6b4d4677'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 01:04:50 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 01:04:50 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 01:04:50 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 01:04:50 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 01:04:50 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=243) INFO 04-22 01:04:50 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen3-8B', speculative_config=None, tokenizer='Qwen/Qwen3-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-8B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) DEBUG 04-22 01:04:51 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.35:33327 backend=nccl +(EngineCore pid=243) INFO 04-22 01:04:51 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.35:33327 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) DEBUG 04-22 01:04:51 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) INFO 04-22 01:04:51 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=243) DEBUG 04-22 01:04:51 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776819891.829098, auto_measure=True +(EngineCore pid=243) DEBUG 04-22 01:04:51 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=243) DEBUG 04-22 01:04:51 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:04:51 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:04:51 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:04:51 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=243) DEBUG 04-22 01:04:51 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=243) DEBUG 04-22 01:04:51 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=243) DEBUG 04-22 01:04:52 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=243) INFO 04-22 01:04:52 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen3-8B... +(EngineCore pid=243) DEBUG 04-22 01:04:52 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=243) INFO 04-22 01:04:52 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=243) INFO 04-22 01:04:52 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=243) DEBUG 04-22 01:04:52 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=243) DEBUG 04-22 01:04:52 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=243) DEBUG 04-22 01:04:52 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=243) DEBUG 04-22 01:04:52 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 145, 'silu_and_mul': 36, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=243) DEBUG 04-22 01:04:52 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=243) DEBUG 04-22 01:04:53 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00005-of-00005.safetensors', 'model-00003-of-00005.safetensors', 'model-00004-of-00005.safetensors', 'model-00002-of-00005.safetensors', 'model-00001-of-00005.safetensors']] +(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00 +(APIServer pid=1) DEBUG 04-22 01:05:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=243) INFO 04-22 01:05:13 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/709aca0718/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=500ddfe46b comp=e546579c48 code=318216e61e692fc38536a6acea806e2e636550278ab73e70427008e98d48a0f3 dir=/data/.cache/vllm/torch_compile_cache/709aca0718/rank_0_0/backbone +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 
'CUDA_HOME': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 
'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 
'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 
'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 
'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 
'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_PLUGINS': 
None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=243) DEBUG 04-22 
01:05:13 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': 
True, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] Vllm config hash: 500ddfe46b +(EngineCore pid=243) INFO 04-22 01:05:13 [compilation/backends.py:1111] Dynamo bytecode transform time: 5.60 s +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=243) DEBUG 04-22 01:05:14 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:05:14 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=243) DEBUG 04-22 01:05:14 
[compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms +(EngineCore pid=243) DEBUG 04-22 01:05:14 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:05:14 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) INFO 04-22 01:05:18 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=243) DEBUG 04-22 01:05:18 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/709aca0718/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=243) DEBUG 04-22 01:05:18 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:05:18 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=243) DEBUG 04-22 01:05:18 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.3 ms +(EngineCore pid=243) DEBUG 04-22 01:05:18 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:05:18 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(APIServer pid=1) DEBUG 04-22 01:05:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) DEBUG 04-22 01:05:21 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/709aca0718/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=243) DEBUG 04-22 01:05:22 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:05:22 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(EngineCore pid=243) DEBUG 04-22 01:05:22 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=243) DEBUG 04-22 01:05:22 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:05:22 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 01:05:22 [compilation/backends.py:377] Store the 36-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_36', '/data/.cache/vllm/torch_compile_cache/709aca0718/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_36') +(EngineCore pid=243) INFO 04-22 01:05:22 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 8.58 s +(EngineCore pid=243) DEBUG 04-22 01:05:22 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/709aca0718/rank_0_0/backbone/computation_graph.py +(EngineCore pid=243) INFO 04-22 01:05:23 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/1b1f0768e3b39bb5337d62f50b4457b3465da85a5edcee9089ee51445148b822/rank_0_0/model +(EngineCore pid=243) INFO 04-22 01:05:23 [compilation/monitor.py:48] torch.compile took 15.63 s in total +(EngineCore pid=243) INFO 04-22 01:05:24 
[compilation/monitor.py:76] Initial profiling/warmup run took 0.56 s +(EngineCore pid=243) INFO 04-22 01:05:29 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=243) DEBUG 04-22 01:05:29 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=243) INFO 04-22 01:05:29 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:05:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:05:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 130.00 MiB first-capture + (51-1) × 8.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:05:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, 
num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:05:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(APIServer pid=1) DEBUG 04-22 01:05:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=243) INFO 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total +(EngineCore pid=243) DEBUG 04-22 01:05:31 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=243) DEBUG 04-22 01:05:31 [v1/worker/gpu_worker.py:430] Free memory after profiling: 61.84 GiB (total), 58.39 GiB (within requested) +(EngineCore pid=243) DEBUG 04-22 01:05:31 [v1/worker/gpu_worker.py:435] Memory profiling takes 23.00 seconds. Total non KV cache memory: 17.73GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.25GiB; weights memory: 15.27GiB. +(EngineCore pid=243) INFO 04-22 01:05:31 [v1/worker/gpu_worker.py:436] Available KV cache memory: 57.5 GiB +(EngineCore pid=243) INFO 04-22 01:05:31 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. 
To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. +(EngineCore pid=243) INFO 04-22 01:05:31 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 418,688 tokens +(EngineCore pid=243) INFO 04-22 01:05:31 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 51.11x +(EngineCore pid=243) 2026-04-22 01:05:31,167 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=243) DEBUG 04-22 01:05:31 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) 2026-04-22 01:05:31,180 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:55:22 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:55:22 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:55:22 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:55:22 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:55:22 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:55:22 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 +(APIServer pid=1) INFO 04-22 01:55:22 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:55:22 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:55:22 [entrypoints/utils.py:233] non-default args: {'model_tag': 'redhatai/Llama-3.3-70B-Instruct-quantized.w8a8', 'model': 'redhatai/Llama-3.3-70B-Instruct-quantized.w8a8', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:55:22 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:55:23 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:55:23 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003789 secs +(APIServer pid=1) INFO 04-22 01:55:23 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 01:55:23 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 01:55:23 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:55:23 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:55:23 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:55:23 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 01:55:23 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:55:23 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 01:55:23 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:55:23 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) INFO 04-22 01:55:24 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 01:55:24 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:55:24 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:55:25 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:55:25 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:55:28 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:55:28 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:55:28 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:55:28 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:55:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:55:28 [platforms/__init__.py:113] Checking if ROCm platform is available. 
+DEBUG 04-22 01:55:28 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:55:28 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:55:28 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:55:28 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:55:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:55:28 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:55:33 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=244) DEBUG 04-22 01:55:35 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:55:35 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=244) DEBUG 04-22 01:55:35 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/1701c665-d91f-4876-9ebb-bf4915d36362'], outputs=['ipc:///tmp/7ea2c92f-1e51-48db-9181-6033967a2814'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=244) DEBUG 04-22 01:55:35 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=244) DEBUG 04-22 01:55:35 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=244) DEBUG 04-22 01:55:35 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=244) DEBUG 04-22 01:55:35 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=244) DEBUG 04-22 01:55:35 [plugins/__init__.py:49] All plugins in this group will be 
loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(EngineCore pid=244) INFO 04-22 01:55:35 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='redhatai/Llama-3.3-70B-Instruct-quantized.w8a8', speculative_config=None, tokenizer='redhatai/Llama-3.3-70B-Instruct-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=redhatai/Llama-3.3-70B-Instruct-quantized.w8a8, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 
'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [4096, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=244) WARNING 04-22 01:55:35 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=244) INFO 04-22 01:55:35 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.3.126 (local), world_size=2, local_world_size=2 +(EngineCore pid=244) DEBUG 04-22 01:55:35 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/ea3f9a82-825d-40ce-bbcc-2ddc6e815723 +(EngineCore pid=244) DEBUG 04-22 01:55:35 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_8477034d'), local_subscribe_addr='ipc:///tmp/ea3f9a82-825d-40ce-bbcc-2ddc6e815723', local_notify_addr='ipc:///tmp/ce075078-4372-434e-85ca-c7a0d0052cd6', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 01:55:38 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:55:38 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:55:38 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:55:38 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:55:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:55:38 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:55:38 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:55:38 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:55:38 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:55:38 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:55:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:55:38 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-22 01:55:38 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:55:38 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:55:38 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:55:38 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:55:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:55:38 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:55:38 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:55:38 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:55:38 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:55:38 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:55:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:55:38 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:55:43 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:55:43 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:55:45 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:55:45 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:55:45 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:55:45 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+DEBUG 04-22 01:55:45 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:55:45 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:55:45 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:55:45 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) DEBUG 04-22 01:55:45 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker pid=443) DEBUG 04-22 01:55:45 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:58613 backend=nccl +(Worker pid=443) INFO 04-22 01:55:45 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:58613 backend=nccl +(Worker pid=444) DEBUG 04-22 01:55:45 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:58613 backend=nccl +(Worker pid=444) INFO 04-22 01:55:45 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:58613 backend=nccl +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=444) DEBUG 04-22 01:55:45 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=443) DEBUG 04-22 01:55:45 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 +(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=443) DEBUG 04-22 01:55:46 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=443) INFO 04-22 01:55:46 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=443) DEBUG 04-22 01:55:46 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=444) DEBUG 04-22 01:55:46 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=443) DEBUG 04-22 01:55:46 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/08771257-3d22-4234-b496-55ad1ccdf616 +(Worker pid=443) DEBUG 04-22 01:55:46 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_6004ee6f'), local_subscribe_addr='ipc:///tmp/08771257-3d22-4234-b496-55ad1ccdf616', local_notify_addr='ipc:///tmp/acdfd856-dfd1-46a2-8bd0-c53fd5031672', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=444) DEBUG 04-22 01:55:46 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/08771257-3d22-4234-b496-55ad1ccdf616 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(Worker pid=443) INFO 04-22 01:55:46 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(Worker pid=443) DEBUG 04-22 01:55:47 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776822947.147593, auto_measure=True +(Worker pid=443) DEBUG 04-22 01:55:47 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=444) DEBUG 04-22 01:55:47 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776822947.1658504, auto_measure=True +(Worker pid=444) DEBUG 04-22 01:55:47 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=443) DEBUG 04-22 01:55:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 01:55:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=443) DEBUG 04-22 01:55:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:55:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 01:55:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=444) DEBUG 04-22 01:55:47 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 01:55:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=443) DEBUG 04-22 01:55:47 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=443) DEBUG 04-22 01:55:47 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=444) DEBUG 04-22 01:55:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=444) DEBUG 04-22 01:55:47 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=443) DEBUG 04-22 01:55:47 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_TP0 pid=443) INFO 04-22 01:55:47 [v1/worker/gpu_model_runner.py:4735] Starting to load model redhatai/Llama-3.3-70B-Instruct-quantized.w8a8... 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.self_attn.qkv_proj +(Worker_TP0 pid=443) INFO 04-22 01:55:47 [model_executor/.../linear/__init__.py:291] Selected CutlassInt8ScaledMMLinearKernel for CompressedTensorsW8A8Int8 +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_TP0 pid=443) INFO 04-22 01:55:47 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(Worker_TP0 pid=443) INFO 04-22 01:55:47 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.self_attn.qkv_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 
01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.self_attn.o_proj 
+(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 
for model.layers.4.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Int8 for model.layers.5.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.self_attn.o_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
model.layers.9.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] 
Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.15.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.self_attn.qkv_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.mlp.gate_up_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.19.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.mlp.gate_up_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.23.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.27.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.31.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.32.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.32.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.32.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.32.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.32.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.32.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.32.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.32.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.33.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.33.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.33.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.33.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.33.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.33.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.33.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.33.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.34.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.34.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.34.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.34.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.34.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.34.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.34.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.34.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.35.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.35.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.35.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.35.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.35.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.35.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.35.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.35.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.36.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.36.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.36.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.36.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.36.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.36.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.36.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.36.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.37.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.37.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.37.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.37.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.37.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.37.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.37.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.37.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.38.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.38.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.38.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.38.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.38.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.38.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.38.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.38.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.39.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.39.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.39.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.39.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.39.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.39.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.39.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.39.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.40.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.40.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.40.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.40.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.40.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.40.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.40.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.40.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.41.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.41.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.41.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.41.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.41.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.41.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.41.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.41.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.42.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.42.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.42.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.42.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.42.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.42.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.42.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.42.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.43.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.43.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.43.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.43.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.43.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.43.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.43.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.43.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.44.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.44.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.44.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.44.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.44.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.44.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.44.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.44.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.45.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.45.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.45.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.45.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.45.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.45.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.45.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.45.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.46.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.46.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.46.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.46.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.46.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.46.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.46.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.46.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.47.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.47.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.47.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.47.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.47.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.47.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.47.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.47.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.48.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.48.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.48.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.48.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.48.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.48.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.48.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.48.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.49.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.49.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.49.mlp.gate_up_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.49.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.49.self_attn.qkv_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.49.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.49.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.49.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.50.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.50.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.50.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.50.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.50.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.50.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.50.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.50.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.51.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.51.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.51.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.51.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.51.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.51.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.51.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.51.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.52.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.52.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.52.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.52.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.52.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.52.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.52.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.52.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.53.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.53.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.53.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.53.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.53.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.53.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.53.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.53.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.54.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.54.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.54.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.54.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.54.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.54.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.54.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.54.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.55.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.55.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.55.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.55.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.55.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.55.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.55.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.55.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.56.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.56.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.56.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.56.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.56.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.56.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.56.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.56.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.57.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.57.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.57.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.57.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.57.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.57.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.57.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.57.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.58.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.58.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.58.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.58.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.58.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.58.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.58.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.58.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.59.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.59.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.59.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.59.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.59.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.59.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.59.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.59.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.60.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.60.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.60.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.60.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.60.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.60.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.60.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.60.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.61.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.61.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.61.mlp.gate_up_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.61.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.61.self_attn.qkv_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.61.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.61.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.61.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.62.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.62.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.62.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.62.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.62.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.62.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.62.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.62.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.63.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.63.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.63.mlp.gate_up_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.63.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.63.self_attn.qkv_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.63.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.63.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.63.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.64.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.64.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.64.mlp.gate_up_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.64.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.64.self_attn.qkv_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.64.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.64.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.64.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.65.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.65.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.65.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.65.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.65.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.65.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.65.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.65.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.66.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.66.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.66.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.66.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.66.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.66.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.66.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.66.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.67.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.67.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.67.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.67.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.67.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.67.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.67.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.67.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.68.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.68.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.68.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.68.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.68.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.68.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.68.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.68.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.69.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.69.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.69.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.69.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.69.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.69.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.69.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.69.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.70.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.70.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.70.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.70.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.70.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.70.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.70.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.70.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.71.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.71.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.71.mlp.gate_up_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.71.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.71.self_attn.qkv_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.71.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.71.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.71.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.72.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.72.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.72.mlp.gate_up_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.72.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.72.self_attn.qkv_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.72.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.72.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.72.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.73.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.73.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.73.mlp.gate_up_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.73.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.73.self_attn.qkv_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.73.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.73.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.73.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.74.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.74.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.74.mlp.gate_up_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.74.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.74.self_attn.qkv_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.74.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.74.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.74.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.75.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.75.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.75.mlp.gate_up_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.75.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.75.self_attn.qkv_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.75.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.75.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.75.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.76.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.76.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.76.mlp.gate_up_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.76.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.76.self_attn.qkv_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.76.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.76.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.76.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.77.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.77.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.77.mlp.gate_up_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.77.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.77.self_attn.qkv_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.77.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.77.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.77.mlp.down_proj 
+(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.78.self_attn.qkv_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.78.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.78.mlp.gate_up_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.78.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.78.self_attn.qkv_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.78.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.78.mlp.gate_up_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.79.self_attn.qkv_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.78.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.79.self_attn.o_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.79.mlp.gate_up_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.79.mlp.down_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.79.self_attn.qkv_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.79.self_attn.o_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.79.mlp.gate_up_proj +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.79.mlp.down_proj +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00007-of-00015.safetensors', 'model-00003-of-00015.safetensors', 'model-00001-of-00015.safetensors', 'model-00014-of-00015.safetensors', 'model-00009-of-00015.safetensors', 'model-00015-of-00015.safetensors', 'model-00010-of-00015.safetensors', 'model-00006-of-00015.safetensors', 'model-00002-of-00015.safetensors', 'model-00012-of-00015.safetensors', 'model-00008-of-00015.safetensors', 'model-00013-of-00015.safetensors', 'model-00005-of-00015.safetensors', 'model-00004-of-00015.safetensors', 'model-00011-of-00015.safetensors']] +(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00005-of-00015.safetensors', 'model-00001-of-00015.safetensors', 'model-00008-of-00015.safetensors', 'model-00009-of-00015.safetensors', 'model-00011-of-00015.safetensors', 'model-00012-of-00015.safetensors', 'model-00007-of-00015.safetensors', 'model-00002-of-00015.safetensors', 'model-00015-of-00015.safetensors', 'model-00010-of-00015.safetensors', 'model-00006-of-00015.safetensors', 'model-00014-of-00015.safetensors', 'model-00013-of-00015.safetensors', 'model-00003-of-00015.safetensors', 'model-00004-of-00015.safetensors']] +(Worker_TP0 pid=443) Loading safetensors checkpoint shards: 0% Completed | 0/15 [00:00 +(Worker_TP1 pid=444) DEBUG 04-22 01:56:08 [compilation/decorators.py:528] Start compiling function 
+(EngineCore pid=244) DEBUG 04-22 01:56:09 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) DEBUG 04-22 01:56:15 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP0 pid=443) INFO 04-22 01:56:21 [compilation/backends.py:1051] Using cache directory: 
/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone for vLLM's torch.compile +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=ad833cdb21 comp=e546579c48 code=00c436b0deda272393bbd56b49f2a57f076817aa66601ed815250cff678fbbf0 dir=/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 
'auto', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, 
+(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
+(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] Vllm config hash: ad833cdb21 +(Worker_TP0 pid=443) INFO 04-22 01:56:21 [compilation/backends.py:1111] Dynamo bytecode transform time: 12.42 s +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 4096 +(Worker_TP0 pid=443) INFO 04-22 01:56:21 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP1 pid=444) DEBUG 04-22 
01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=ad833cdb21 comp=e546579c48 code=00c436b0deda272393bbd56b49f2a57f076817aa66601ed815250cff678fbbf0 dir=/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_1_0/backbone +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP1 pid=444) DEBUG 04-22 
01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 
'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 
'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP1 pid=444) DEBUG 04-22 
01:56:21 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP1 pid=444) DEBUG 04-22 
01:56:21 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP1 
pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 
'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] Vllm config hash: ad833cdb21 +(Worker_TP0 pid=443) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. +(Worker_TP0 pid=443) return func(*args, **kwargs) +(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=4096, hidden_dim=8192, dtype=torch.bfloat16 +(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=4096, hidden_dim=8192, dtype=torch.bfloat16 +(Worker_TP0 pid=443) INFO 04-22 01:56:21 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm +(Worker_TP0 pid=443) DEBUG 04-22 01:56:23 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 4096), (4097, 8192)] +(Worker_TP0 pid=443) DEBUG 04-22 01:56:23 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(Worker_TP0 pid=443) DEBUG 04-22 01:56:24 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=443) DEBUG 04-22 01:56:24 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:24 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP0 pid=443) DEBUG 04-22 01:56:24 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 23.0 ms +(Worker_TP0 pid=443) DEBUG 04-22 
01:56:24 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:24 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP0 pid=443) DEBUG 04-22 01:56:24 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:24 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=444) DEBUG 04-22 01:56:24 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:24 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP1 pid=444) DEBUG 04-22 01:56:24 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 23.5 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:24 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:24 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP1 pid=444) DEBUG 04-22 01:56:24 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(APIServer pid=1) DEBUG 04-22 01:56:25 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP0 pid=443) INFO 04-22 01:56:26 [compilation/backends.py:372] Cache the graph of compile range (1, 4096) for later use +(Worker_TP0 pid=443) DEBUG 04-22 01:56:26 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_0', '/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_0') +(Worker_TP0 pid=443) DEBUG 04-22 01:56:26 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=443) DEBUG 04-22 01:56:26 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:26 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP0 pid=443) DEBUG 04-22 01:56:26 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:26 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=443) DEBUG 04-22 01:56:26 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:27 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=444) DEBUG 04-22 01:56:27 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:27 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP1 pid=444) DEBUG 04-22 01:56:27 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:27 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=444) DEBUG 04-22 01:56:27 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed 
in 0.4 ms +(Worker_TP0 pid=443) INFO 04-22 01:56:27 [compilation/backends.py:372] Cache the graph of compile range (4097, 8192) for later use +(Worker_TP0 pid=443) DEBUG 04-22 01:56:27 [compilation/backends.py:377] Store the 0-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_0') +(Worker_TP0 pid=443) DEBUG 04-22 01:56:28 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=443) DEBUG 04-22 01:56:28 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:28 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=443) DEBUG 04-22 01:56:28 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.7 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:28 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:28 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=443) DEBUG 04-22 01:56:28 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:28 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=444) DEBUG 04-22 01:56:28 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:28 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=444) DEBUG 04-22 01:56:28 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 37.1 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:28 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms 
+(Worker_TP1 pid=444) DEBUG 04-22 01:56:28 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=444) DEBUG 04-22 01:56:28 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:29 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_1', '/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_1') +(Worker_TP0 pid=443) DEBUG 04-22 01:56:29 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=443) DEBUG 04-22 01:56:29 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:29 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP0 pid=443) DEBUG 04-22 01:56:29 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:29 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=443) DEBUG 04-22 01:56:29 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:29 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=444) DEBUG 04-22 01:56:29 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:29 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP1 pid=444) DEBUG 04-22 01:56:29 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:29 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 
nodes, removed 0 nodes +(Worker_TP1 pid=444) DEBUG 04-22 01:56:29 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:31 [compilation/backends.py:377] Store the 1-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_1') +(APIServer pid=1) DEBUG 04-22 01:56:35 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=443) DEBUG 04-22 01:56:37 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=443) DEBUG 04-22 01:56:37 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:37 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=443) DEBUG 04-22 01:56:37 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.8 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:37 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:37 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP0 pid=443) DEBUG 04-22 01:56:37 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass 
completed in 38.1 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:38 [compilation/backends.py:377] Store the 80-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_80', '/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_80') +(Worker_TP0 pid=443) INFO 04-22 01:56:38 [compilation/backends.py:390] Compiling a graph for compile range (1, 4096) takes 11.64 s +(Worker_TP0 pid=443) DEBUG 04-22 01:56:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=443) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:38 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP0 pid=443) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=443) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/passes/pass_manager.py:100] 
Skipping with compile range (4097, 8192) +(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=443) DEBUG 04-22 01:56:39 [compilation/backends.py:377] Store the 80-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_80', '/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_80') +(Worker_TP0 pid=443) INFO 04-22 01:56:39 [compilation/backends.py:390] Compiling a graph for compile range (4097, 8192) takes 12.71 s +(Worker_TP0 pid=443) DEBUG 04-22 01:56:39 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone/computation_graph.py +(Worker_TP0 pid=443) INFO 04-22 01:56:42 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/eac9c02a0de983743bee948426f8d57256d1535569de6e48b0a78e1093f03a5a/rank_0_0/model +(Worker_TP0 pid=443) INFO 04-22 01:56:42 [compilation/monitor.py:48] torch.compile took 33.81 s in total +(Worker_TP0 pid=443) INFO 04-22 01:56:44 [compilation/monitor.py:76] Initial profiling/warmup run took 1.92 s +(APIServer pid=1) DEBUG 04-22 01:56:45 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP0 pid=443) INFO 04-22 01:56:50 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP0 pid=443) DEBUG 04-22 01:56:50 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP0 pid=443) INFO 04-22 01:56:50 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP1 pid=444) INFO 04-22 01:56:50 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP1 pid=444) DEBUG 04-22 01:56:50 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP1 pid=444) INFO 04-22 01:56:50 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP0 pid=443) DEBUG 04-22 01:56:50 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:56:50 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:56:50 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:56:50 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=444) DEBUG 04-22 01:56:50 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:56:50 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 122.00 MiB first-capture + (51-1) × 20.00 MiB per-graph +(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 122.00 MiB first-capture + (51-1) × 20.00 MiB per-graph +(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
+(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 268.00 MiB first-capture + (51-1) × 14.00 MiB per-graph +(Worker_TP1 pid=444) INFO 04-22 01:56:51 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 268.00 MiB first-capture + (51-1) × 14.00 MiB per-graph +(Worker_TP0 pid=443) INFO 04-22 01:56:51 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP0 pid=443) DEBUG 04-22 01:56:52 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP0 pid=443) INFO 04-22 01:56:52 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.92 GiB total +(Worker_TP1 pid=444) DEBUG 04-22 01:56:52 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP1 pid=444) INFO 04-22 01:56:52 [v1/worker/gpu_model_runner.py:5955] Estimated 
CUDA graph memory: 1.92 GiB total +(Worker_TP0 pid=443) DEBUG 04-22 01:56:52 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP0 pid=443) DEBUG 04-22 01:56:52 [v1/worker/gpu_worker.py:430] Free memory after profiling: 41.35 GiB (total), 38.92 GiB (within requested) +(Worker_TP0 pid=443) DEBUG 04-22 01:56:52 [v1/worker/gpu_worker.py:435] Memory profiling takes 43.50 seconds. Total non KV cache memory: 37.95GiB; torch peak memory increase: 1.96GiB; non-torch forward increase memory: 2.11GiB; weights memory: 33.88GiB. +(Worker_TP0 pid=443) INFO 04-22 01:56:52 [v1/worker/gpu_worker.py:436] Available KV cache memory: 37.28 GiB +(Worker_TP0 pid=443) INFO 04-22 01:56:52 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9743 to maintain the same effective KV cache size. +(Worker_TP0 pid=443) DEBUG 04-22 01:56:52 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 01:56:52 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 01:56:52 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=444) DEBUG 04-22 01:56:52 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP1 pid=444) DEBUG 04-22 01:56:52 [v1/worker/gpu_worker.py:430] Free memory after profiling: 41.35 GiB (total), 38.92 GiB (within requested) +(Worker_TP1 pid=444) DEBUG 04-22 01:56:52 [v1/worker/gpu_worker.py:435] Memory profiling takes 43.59 seconds. 
Total non KV cache memory: 37.95GiB; torch peak memory increase: 1.96GiB; non-torch forward increase memory: 2.11GiB; weights memory: 33.88GiB. +(Worker_TP1 pid=444) INFO 04-22 01:56:52 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9743 to maintain the same effective KV cache size. +(Worker_TP1 pid=444) DEBUG 04-22 01:56:52 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 01:56:52 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) INFO 04-22 01:56:52 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 244,304 tokens +(EngineCore pid=244) INFO 04-22 01:56:52 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 29.82x +(Worker_TP1 pid=444) DEBUG 04-22 01:56:52 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=443) DEBUG 04-22 01:56:52 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=444) INFO 04-22 01:56:52 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 +(Worker_TP0 pid=443) INFO 04-22 01:56:52 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 +(Worker_TP1 pid=444) DEBUG 04-22 01:56:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) DEBUG 04-22 01:56:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) 2026-04-22 01:56:52,690 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(Worker_TP1 pid=444) 2026-04-22 01:56:52,690 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP0 pid=443) DEBUG 04-22 01:56:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=444) DEBUG 04-22 01:56:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) 2026-04-22 01:56:53,303 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP1 pid=444) 2026-04-22 01:56:53,303 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=244) DEBUG 04-22 01:56:53 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=444) DEBUG 04-22 01:56:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=443) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=244) INFO 04-22 01:57:03 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP0 pid=443) DEBUG 04-22 01:57:03 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=444) DEBUG 04-22 01:57:03 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 01:57:03 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=244) DEBUG 04-22 01:57:03 [v1/engine/core.py:1158] EngineCore waiting for work. 
+(EngineCore pid=244) DEBUG 04-22 01:57:03 [v1/engine/core.py:1158] EngineCore waiting for work. +(APIServer pid=1) INFO 04-22 01:57:03 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] +(APIServer pid=1) WARNING 04-22 01:57:04 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 0.6, 'top_p': 0.9}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. +(APIServer pid=1) DEBUG 04-22 01:57:04 [renderers/base.py:197] Warming up chat template processing... +(APIServer pid=1) DEBUG 04-22 01:57:04 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e82af0-730e730b2a43419e17ae13cc;6e56a674-0198-405a-8e28-0b9e1442f3df) +(APIServer pid=1) DEBUG 04-22 01:57:04 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 01:57:04 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/RedHatAI/Llama-3.3-70B-Instruct-quantized.w8a8/resolve/main/processor_config.json. +(APIServer pid=1) DEBUG 04-22 01:57:04 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e82af0-02cf5c644987fd933928fe23;bce00550-b3f4-4c54-90ef-7f32f9f19154) +(APIServer pid=1) DEBUG 04-22 01:57:04 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 01:57:04 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/RedHatAI/Llama-3.3-70B-Instruct-quantized.w8a8/resolve/main/preprocessor_config.json. 
+(Worker_TP0 pid=443) DEBUG 04-22 01:57:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=444) DEBUG 04-22 01:57:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) INFO 04-22 01:57:05 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. +(APIServer pid=1) DEBUG 04-22 01:57:05 [renderers/base.py:203] Chat template warmup completed in 1.355s +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:37] Available routes are: +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /load, Methods: GET +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /version, Methods: GET +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /health, Methods: GET +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=1) INFO 04-22 
01:57:05 [entrypoints/launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST +(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST +(APIServer pid=1) INFO: Started server process [1] +(APIServer pid=1) INFO: Waiting for application startup. +(APIServer pid=1) INFO: Application startup complete. +(APIServer pid=1) DEBUG 04-22 01:57:11 [v1/engine/async_llm.py:875] Called check_health. 
+(APIServer pid=1) INFO: 10.131.2.2:58456 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/w4a16-redhatai-lla--h100-80gb--tp1pp1dp1--8192-dtf16.log b/accuracy/results/v0.19.0/logs/w4a16-redhatai-lla--h100-80gb--tp1pp1dp1--8192-dtf16.log new file mode 100644 index 00000000..94e136a0 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/w4a16-redhatai-lla--h100-80gb--tp1pp1dp1--8192-dtf16.log @@ -0,0 +1,745 @@ +DEBUG 04-22 00:10:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:10:51 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:10:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:10:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:10:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:10:51 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:10:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:10:51 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:10:51 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:10:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:10:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:10:51 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:10:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 00:10:58 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 00:10:58 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:10:58 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:10:58 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:10:58 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 00:10:58 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:10:58 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:10:58 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:10:58 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 +(APIServer pid=1) INFO 04-22 00:10:58 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:10:58 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:10:58 [entrypoints/utils.py:233] non-default args: {'model_tag': 'RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16', 'model': 'RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16', 'dtype': 'float16', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:10:58 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:10:58 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:10:58 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0004022 secs +(APIServer pid=1) INFO 04-22 00:10:58 [config/model.py:549] Resolved architecture: 
LlamaForCausalLM +(APIServer pid=1) INFO 04-22 00:10:58 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) INFO 04-22 00:10:59 [model_executor/.../quantization/gptq_marlin.py:229] The model is convertible to gptq_marlin during runtime. Using gptq_marlin kernel. +(APIServer pid=1) DEBUG 04-22 00:10:59 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:10:59 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:10:59 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:10:59 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:10:59 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 00:10:59 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:10:59 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:10:59 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:10:59 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:11:00 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:11:00 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:11:03 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:11:03 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:11:03 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:11:03 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-22 00:11:03 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:11:03 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:11:03 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:11:03 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:11:03 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:11:03 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:11:03 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:11:04 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:11:08 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=244) DEBUG 04-22 00:11:10 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:11:10 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=244) DEBUG 04-22 00:11:10 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/429d0cbc-ad35-4f71-b5c3-82da97b23547'], outputs=['ipc:///tmp/1b75cf70-26aa-4bf5-90dd-5602d37f9019'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=244) DEBUG 04-22 00:11:10 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=244) DEBUG 04-22 00:11:10 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=244) DEBUG 04-22 00:11:10 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=244) DEBUG 04-22 00:11:10 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=244) DEBUG 04-22 00:11:10 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=244) INFO 04-22 00:11:10 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16', speculative_config=None, tokenizer='RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=gptq_marlin, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 
'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=244) DEBUG 04-22 00:11:10 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.24:37197 backend=nccl +(EngineCore pid=244) INFO 04-22 00:11:10 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.24:37197 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) DEBUG 04-22 00:11:10 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) INFO 04-22 00:11:10 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=244) DEBUG 04-22 00:11:11 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776816671.2624762, auto_measure=True +(EngineCore pid=244) DEBUG 04-22 00:11:11 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=244) DEBUG 04-22 00:11:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:11:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:11:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:11:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=244) DEBUG 04-22 00:11:11 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=244) DEBUG 04-22 00:11:11 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=244) DEBUG 04-22 00:11:11 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=244) INFO 04-22 00:11:11 [v1/worker/gpu_model_runner.py:4735] Starting to load model RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16... +(EngineCore pid=244) INFO 04-22 00:11:11 [model_executor/.../quantization/gptq_marlin.py:376] Using MacheteLinearKernel for GPTQMarlinLinearMethod +(EngineCore pid=244) DEBUG 04-22 00:11:12 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.float16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=244) INFO 04-22 00:11:12 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=244) INFO 04-22 00:11:12 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=244) DEBUG 04-22 00:11:12 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=244) DEBUG 04-22 00:11:12 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=244) DEBUG 04-22 00:11:12 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=244) DEBUG 04-22 00:11:12 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=244) DEBUG 04-22 00:11:12 [model_executor/model_loader/weight_utils.py:557] Using model weights format ['*.safetensors'] +(EngineCore pid=244) INFO 04-22 00:11:12 [model_executor/model_loader/weight_utils.py:625] No model.safetensors.index.json found in remote. +(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00 +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/mixed_precision/MPLinearKernel.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/mixed_precision/machete.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/gptq_marlin.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(EngineCore pid=244) INFO 04-22 00:11:26 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/1ed2f413cc/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=9913286625 comp=e546579c48 code=c108c187e9321c2995692e3c6585a7f067e86dd4cdaa92de1c48e27c81f442dc dir=/data/.cache/vllm/torch_compile_cache/1ed2f413cc/rank_0_0/backbone +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=244) DEBUG 
04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 
'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 
'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=244) DEBUG 04-22 
00:11:26 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=244) DEBUG 04-22 
00:11:26 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 
[compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=244) DEBUG 04-22 00:11:26 
[compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore 
pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 
'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] Vllm config hash: 9913286625 +(EngineCore pid=244) INFO 04-22 00:11:26 [compilation/backends.py:1111] Dynamo bytecode transform time: 5.44 s +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=244) DEBUG 04-22 00:11:27 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:11:27 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=244) DEBUG 04-22 00:11:27 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=244) DEBUG 04-22 00:11:27 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:11:27 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=244) INFO 04-22 00:11:29 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=244) DEBUG 04-22 00:11:29 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/1ed2f413cc/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=244) DEBUG 04-22 00:11:30 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:11:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=244) DEBUG 04-22 00:11:30 
[compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.4 ms +(EngineCore pid=244) DEBUG 04-22 00:11:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:11:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(APIServer pid=1) DEBUG 04-22 00:11:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=244) DEBUG 04-22 00:11:31 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/1ed2f413cc/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=244) DEBUG 04-22 00:11:32 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:11:32 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.0 ms +(EngineCore pid=244) DEBUG 04-22 00:11:32 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms +(EngineCore pid=244) DEBUG 04-22 00:11:32 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:11:32 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=244) DEBUG 04-22 00:11:32 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/1ed2f413cc/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') +(EngineCore pid=244) INFO 04-22 00:11:32 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.01 s +(EngineCore pid=244) DEBUG 04-22 00:11:33 [compilation/backends.py:1228] Computation graph saved 
to /data/.cache/vllm/torch_compile_cache/1ed2f413cc/rank_0_0/backbone/computation_graph.py +(EngineCore pid=244) INFO 04-22 00:11:34 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/b11bf988391eee29fa0da7d22d77941137b65c24097fd54712a4123fb7071154/rank_0_0/model +(EngineCore pid=244) INFO 04-22 00:11:34 [compilation/monitor.py:48] torch.compile took 13.36 s in total +(EngineCore pid=244) INFO 04-22 00:11:34 [compilation/monitor.py:76] Initial profiling/warmup run took 0.40 s +(APIServer pid=1) DEBUG 04-22 00:11:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=244) INFO 04-22 00:11:40 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=244) INFO 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:11:40 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:11:40 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, 
has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 126.00 MiB first-capture + (51-1) × 8.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:11:40 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:11:40 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-22 00:11:41 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=244) INFO 04-22 00:11:41 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total +(EngineCore pid=244) DEBUG 04-22 00:11:41 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=244) DEBUG 04-22 00:11:41 [v1/worker/gpu_worker.py:430] Free memory after profiling: 72.84 GiB (total), 69.4 GiB (within requested) +(EngineCore pid=244) DEBUG 04-22 00:11:41 [v1/worker/gpu_worker.py:435] Memory profiling takes 20.61 seconds. 
Total non KV cache memory: 7.51GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 5.38GiB. +(EngineCore pid=244) INFO 04-22 00:11:41 [v1/worker/gpu_worker.py:436] Available KV cache memory: 67.71 GiB +(EngineCore pid=244) INFO 04-22 00:11:41 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. +(EngineCore pid=244) INFO 04-22 00:11:41 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 554,704 tokens +(EngineCore pid=244) INFO 04-22 00:11:41 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 67.71x +(EngineCore pid=244) 2026-04-22 00:11:41,843 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=244) DEBUG 04-22 00:11:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) 2026-04-22 00:11:41,854 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:09:44 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:09:44 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:09:44 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:09:44 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:09:44 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:09:44 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 +(APIServer pid=1) INFO 04-22 00:09:44 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:09:44 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:09:44 [entrypoints/utils.py:233] non-default args: {'model_tag': 'RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8', 'model': 'RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8', 'dtype': 'float16', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:09:44 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:09:45 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:09:45 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003835 secs +(APIServer pid=1) INFO 04-22 00:09:45 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) WARNING 04-22 00:09:45 [config/model.py:2016] Casting torch.bfloat16 to torch.float16. +(APIServer pid=1) INFO 04-22 00:09:45 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:09:45 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:09:45 [config/model.py:1801] Generative models support prefix caching. 
+(APIServer pid=1) DEBUG 04-22 00:09:45 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:09:45 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:09:45 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 00:09:45 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:09:45 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:09:45 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:09:45 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:09:46 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:09:46 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:09:49 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:09:49 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:09:49 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:09:49 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:09:49 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:09:49 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:09:49 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:09:49 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:09:49 [platforms/__init__.py:165] Checking if CPU platform is available. 
+DEBUG 04-22 00:09:49 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:09:49 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:09:49 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:09:54 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=243) DEBUG 04-22 00:09:56 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:09:56 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=243) DEBUG 04-22 00:09:56 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/1f7316b6-80e0-4336-88b3-183484609596'], outputs=['ipc:///tmp/c432d9a3-9551-4a4e-816f-13a218d6bd75'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 00:09:56 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 00:09:56 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 00:09:56 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 00:09:56 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 00:09:56 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=243) INFO 04-22 00:09:56 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8', speculative_config=None, tokenizer='RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 
'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) DEBUG 04-22 00:09:56 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.188:32907 backend=nccl +(EngineCore pid=243) INFO 04-22 00:09:56 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.188:32907 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) DEBUG 04-22 00:09:56 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) INFO 04-22 00:09:56 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=243) DEBUG 04-22 00:09:57 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776816597.0954285, auto_measure=True +(EngineCore pid=243) DEBUG 04-22 00:09:57 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=243) DEBUG 04-22 00:09:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:09:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:09:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:09:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=243) DEBUG 04-22 00:09:57 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=243) DEBUG 04-22 00:09:57 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=243) INFO 04-22 00:09:57 [v1/worker/gpu_model_runner.py:4735] Starting to load model RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8... +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.self_attn.qkv_proj +(EngineCore pid=243) INFO 04-22 00:09:57 [model_executor/.../linear/__init__.py:291] Selected CutlassInt8ScaledMMLinearKernel for CompressedTensorsW8A8Int8 +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.float16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=243) INFO 04-22 00:09:57 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(EngineCore pid=243) INFO 04-22 00:09:57 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 
00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
model.layers.5.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Int8 for model.layers.8.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
model.layers.13.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] 
Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
model.layers.21.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] 
Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
model.layers.29.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.mlp.down_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.self_attn.qkv_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.self_attn.o_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.mlp.gate_up_proj +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.mlp.down_proj +(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
+(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=243) DEBUG 04-22 00:09:58 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=243) DEBUG 04-22 00:09:58 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=243) DEBUG 04-22 00:09:58 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors']] +(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 +(APIServer pid=1) DEBUG 04-22 00:10:06 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=243) DEBUG 04-22 
00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=243) INFO 04-22 00:10:11 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/89d9e052d6/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=0a3183d67c 
comp=e546579c48 code=00c436b0deda272393bbd56b49f2a57f076817aa66601ed815250cff678fbbf0 dir=/data/.cache/vllm/torch_compile_cache/89d9e052d6/rank_0_0/backbone +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VERBOSE': False, 
+(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 
'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, 
+(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore 
pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=243) 
DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=243) DEBUG 04-22 
00:10:11 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=243) 
DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 
256, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, 
+(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 
'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] Vllm config hash: 0a3183d67c +(EngineCore pid=243) INFO 04-22 00:10:11 [compilation/backends.py:1111] Dynamo bytecode transform time: 5.62 s +(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=243) DEBUG 04-22 00:10:11 
[compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=243) DEBUG 04-22 00:10:12 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:10:12 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=243) DEBUG 04-22 00:10:12 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms +(EngineCore pid=243) DEBUG 04-22 00:10:12 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:10:12 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) INFO 04-22 00:10:14 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=243) DEBUG 04-22 00:10:14 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/89d9e052d6/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=243) DEBUG 04-22 00:10:15 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:10:15 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=243) DEBUG 04-22 00:10:15 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.6 ms +(EngineCore pid=243) DEBUG 04-22 00:10:15 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:10:15 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(APIServer pid=1) DEBUG 04-22 00:10:16 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) DEBUG 04-22 00:10:16 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/89d9e052d6/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=243) DEBUG 04-22 00:10:17 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:10:17 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.2 ms +(EngineCore pid=243) DEBUG 04-22 00:10:17 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.3 ms +(EngineCore pid=243) DEBUG 04-22 00:10:17 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:10:17 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(EngineCore pid=243) DEBUG 04-22 00:10:18 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/89d9e052d6/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') +(EngineCore pid=243) INFO 04-22 00:10:18 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.40 s +(EngineCore pid=243) DEBUG 04-22 00:10:18 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/89d9e052d6/rank_0_0/backbone/computation_graph.py +(EngineCore pid=243) INFO 04-22 00:10:19 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/0d0d81f25d4664a542cae9b3d62552b300cbbc433ae2ccda7e244ce4a0a6c9d8/rank_0_0/model +(EngineCore pid=243) INFO 04-22 00:10:19 [compilation/monitor.py:48] torch.compile took 13.43 s in total +(EngineCore pid=243) INFO 04-22 00:10:19 
[compilation/monitor.py:76] Initial profiling/warmup run took 0.33 s +(EngineCore pid=243) INFO 04-22 00:10:25 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=243) INFO 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:10:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:10:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 8.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:10:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, 
num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:10:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(APIServer pid=1) DEBUG 04-22 00:10:26 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=243) DEBUG 04-22 00:10:26 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=243) INFO 04-22 00:10:26 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total +(EngineCore pid=243) DEBUG 04-22 00:10:26 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=243) DEBUG 04-22 00:10:26 [v1/worker/gpu_worker.py:430] Free memory after profiling: 69.73 GiB (total), 66.28 GiB (within requested) +(EngineCore pid=243) DEBUG 04-22 00:10:26 [v1/worker/gpu_worker.py:435] Memory profiling takes 20.90 seconds. Total non KV cache memory: 10.63GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 8.49GiB. +(EngineCore pid=243) INFO 04-22 00:10:26 [v1/worker/gpu_worker.py:436] Available KV cache memory: 64.6 GiB +(EngineCore pid=243) INFO 04-22 00:10:26 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. 
To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. +(EngineCore pid=243) INFO 04-22 00:10:26 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 529,200 tokens +(EngineCore pid=243) INFO 04-22 00:10:26 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 64.60x +(EngineCore pid=243) 2026-04-22 00:10:26,778 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=243) DEBUG 04-22 00:10:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) 2026-04-22 00:10:26,789 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00] +Analyze capacity planner predictions vs actual vLLM measurements. +Generates a markdown report with error statistics per memory component. """ -import argparse + import csv -import json -import re -import sys +import math +import statistics from pathlib import Path -from typing import Any - -COMPONENTS = ["weight_memory", "activation_memory", "non_torch_memory", "kv_cache"] -_MKEYS = { - "weight_memory": "weight_memory_gib", - "activation_memory": "activation_memory_gib", - "non_torch_memory": "non_torch_memory_gib", - "kv_cache": "kv_cache_gib", -} -# Known GPU memory sizes (GiB). _gpu_memory_gib() falls back to regex parsing. 
-_GPU_MEMORY_GIB: dict[str, int] = { - "H100-80GB": 80, "H100-40GB": 40, - "A100-80GB": 80, "A100-40GB": 40, - "L40S": 48, "L4": 24, "A10G": 24, "A10": 24, - "V100-32GB": 32, "V100-16GB": 16, -} +REPO = Path(__file__).parent.parent +RAW_CSV = REPO / "results/v0.19.0/results_raw.csv" +PRED_CSV = REPO / "results/v0.19.0/results_predicted.csv" +OUT_MD = REPO / "results/v0.19.0/accuracy_report.md" -def _gpu_memory_gib(gpu_name: str) -> int: - if gpu_name in _GPU_MEMORY_GIB: - return _GPU_MEMORY_GIB[gpu_name] - m = re.search(r"(\d+)\s*GB", gpu_name, re.IGNORECASE) - if m: - return int(m.group(1)) - raise ValueError(f"Cannot determine GPU memory for: {gpu_name!r}") - - -def compute_planner_predictions(run: dict[str, Any], hf_token: str | None = None) -> dict[str, float]: - """Call the capacity planner for the given run's model + vllm_args.""" - from planner.capacity_planner import ( - allocatable_kv_cache_memory, - estimate_vllm_activation_memory, - estimate_vllm_cuda_graph_memory, - estimate_vllm_non_torch_memory, - get_model_config_from_hf, - per_gpu_model_memory_required, - ) +# ── Helpers ─────────────────────────────────────────────────────────────────── - model_name: str = run["model"] - va: dict = run.get("vllm_args", run) - tp = int(va.get("tensor_parallel_size", run.get("tp", 1))) - pp = int(va.get("pipeline_parallel_size", run.get("pp", 1))) - dp = int(va.get("data_parallel_size", run.get("dp", 1))) - max_model_len = int(va.get("max_model_len", run.get("max_model_len", 8192))) - gpu_util = float(va.get("gpu_memory_utilization", 0.9)) - gpu_memory = _gpu_memory_gib(run["gpu"]) - - model_config = get_model_config_from_hf(model_name, hf_token) - weight = per_gpu_model_memory_required(model_name, model_config, tp, pp, hf_token) - kv = allocatable_kv_cache_memory( - model_name, model_config, gpu_memory, gpu_util, - tp=tp, pp=pp, dp=dp, max_model_len=max_model_len, hf_token=hf_token, - ) - activation = estimate_vllm_activation_memory(model_config, tp=tp) - non_torch 
= estimate_vllm_non_torch_memory(tp) - cuda_graph = estimate_vllm_cuda_graph_memory() +def pct_error(actual: float, predicted: float) -> float: + """(predicted - actual) / actual * 100. Positive = over-estimate.""" + if actual == 0: + return float("nan") + return (predicted - actual) / actual * 100.0 - # allocatable_kv_cache_memory() returns total KV across all (tp×pp) GPUs. - # vLLM logs "Available KV cache memory" per GPU, so divide to match. - kv_per_gpu = kv / (tp * pp) +def fmt(v: float, decimals: int = 2) -> str: + if math.isnan(v): + return "n/a" + sign = "+" if v >= 0 else "" + return f"{sign}{v:.{decimals}f}%" + + +def stats(values: list[float]) -> dict: + vals = [v for v in values if not math.isnan(v)] + if not vals: + return {"mean": float("nan"), "median": float("nan"), + "min": float("nan"), "max": float("nan"), + "abs_mean": float("nan"), "n": 0} return { - "weight_memory_gib": round(weight, 2), - "activation_memory_gib": round(activation, 2), - "non_torch_memory_gib": round(non_torch, 2), - "kv_cache_gib": round(kv_per_gpu, 2), - "kv_cache_total_gib": round(kv, 2), - "cuda_graph_memory_gib": round(cuda_graph, 2), + "mean": statistics.mean(vals), + "median": statistics.median(vals), + "min": min(vals), + "max": max(vals), + "abs_mean": statistics.mean(abs(v) for v in vals), + "n": len(vals), } -def _normalize_run(run: dict[str, Any], hf_token: str | None = None) -> dict[str, Any]: - """Convert flat vLLM-log format to analyzed format; no-op for pre-analyzed format.""" - if "measured" in run or "planner_predicted" in run: - return run - - va: dict = run.get("vllm_args", {}) - normalized: dict[str, Any] = { - "model": run["model"], - "gpu": run.get("gpu", "unknown"), - "tp": int(va.get("tensor_parallel_size", 1)), - "pp": int(va.get("pipeline_parallel_size", 1)), - "dp": int(va.get("data_parallel_size", 1)), - "max_model_len": int(va.get("max_model_len", 8192)), - "vllm_args": va, - } - if "_sweep_dim" in run: - normalized["_sweep_dim"] = 
run["_sweep_dim"] - - # activation_memory and non_torch_memory are not directly logged by vLLM - normalized["measured"] = { - "weight_memory_gib": run.get("weight_memory_gib"), - "kv_cache_gib": run.get("kv_cache_memory_gib"), - "activation_memory_gib": None, - "non_torch_memory_gib": None, - } +def stats_row(label: str, errors: list[float]) -> str: + s = stats(errors) + if s["n"] == 0: + return f"| {label} | — | — | — | — | — | — |" + return ( + f"| {label} | {fmt(s['mean'])} | {fmt(s['median'])} | " + f"{fmt(s['abs_mean'])} | {fmt(s['min'])} | {fmt(s['max'])} | {s['n']} |" + ) - try: - normalized["planner_predicted"] = compute_planner_predictions(run, hf_token) - except Exception as exc: - print( - f"Warning: planner prediction failed for {run['model']}: {exc}", - file=sys.stderr, - ) - - return normalized - - -def compute_error_pct(run: dict[str, Any]) -> dict[str, float | None]: - result: dict[str, float | None] = {} - for c in COMPONENTS: - key = _MKEYS[c] - measured = run["measured"].get(key) - predicted = run["planner_predicted"].get(key) - if measured is not None and predicted is not None and measured != 0: - result[c] = (predicted - measured) / measured * 100 - else: - result[c] = None - return result - - -def load_runs(directory: str | Path, hf_token: str | None = None) -> list[dict[str, Any]]: - runs = [] - for p in sorted(Path(directory).glob("*.json")): - data = json.loads(p.read_text()) - if data.get("skipped"): - continue - data = _normalize_run(data, hf_token) - if "error_pct" not in data and "planner_predicted" in data: - data["error_pct"] = compute_error_pct(data) - runs.append(data) - return runs - - -def find_outliers(runs: list[dict[str, Any]], threshold_pct: float = 10.0) -> list[dict[str, Any]]: - return [ - r for r in runs - if any( - v is not None and abs(v) > threshold_pct - for v in r.get("error_pct", {}).values() - ) - ] - - -def _fmt(v: float | None) -> str: - if v is None: - return "—" - return f"{'+'if v>0 else ''}{v:.1f}%" - - -def 
generate_markdown_report(runs: list[dict[str, Any]]) -> str: - lines = ["# Memory Validation Report\n"] - - lines += ["## Per-component error\n", - "| Model | TP | PP | DP | max_len | Weight | Activation | Non-torch | KV cache |", - "|---|---|---|---|---|---|---|---|---|"] - for r in runs: - e = r.get("error_pct", {}) - lines.append( - f"| {r['model']} | {r['tp']} | {r['pp']} | {r['dp']} | {r['max_model_len']} " - f"| {_fmt(e.get('weight_memory'))} | {_fmt(e.get('activation_memory'))} " - f"| {_fmt(e.get('non_torch_memory'))} | {_fmt(e.get('kv_cache'))} |" - ) - lines.append("") - - lines += ["## Per-architecture error\n", - "_Group by architecture class. Mean and max absolute error per component._\n"] - - lines.append("## Argument sensitivity\n") - - def _sweep_val(r: dict, dim: str) -> Any: - if dim in r: - return r[dim] - return r.get("vllm_args", {}).get(dim, "?") - - for sweep_dim in ("max_model_len", "tp", "pp", "dp", "dtype", "quantization", "kv_cache_dtype"): - sweep_runs = [r for r in runs if r.get("_sweep_dim") == sweep_dim] - if sweep_runs: - lines += [f"### {sweep_dim} sweep\n", - "| Value | Weight | Activation | Non-torch | KV cache |", - "|---|---|---|---|---|"] - for r in sweep_runs: - e = r.get("error_pct", {}) - lines.append( - f"| {_sweep_val(r, sweep_dim)} " - f"| {_fmt(e.get('weight_memory'))} | {_fmt(e.get('activation_memory'))} " - f"| {_fmt(e.get('non_torch_memory'))} | {_fmt(e.get('kv_cache'))} |" - ) - lines.append("") - - lines.append("## Outliers\n") - outliers = find_outliers(runs) - if outliers: - for r in outliers: - bad = {k: v for k, v in r.get("error_pct", {}).items() if v is not None and abs(v) > 10} - lines.append(f"- **{r['model']}** (TP={r['tp']}): {bad} — root cause required") - else: - lines.append("_No outliers (all components within ±10%)._") - lines.append("") - - lines += ["## Calibration decisions\n", - "_Document constant changes here: old value → new value, evidence._\n"] - return "\n".join(lines) - - -def main() -> 
None: - ap = argparse.ArgumentParser() - ap.add_argument("--runs", required=True) - ap.add_argument("--out", required=True) - ap.add_argument("--csv", default=None) - ap.add_argument("--hf-token", default=None, - help="HuggingFace API token (needed for gated models)") - args = ap.parse_args() - - runs = load_runs(args.runs, hf_token=args.hf_token) - Path(args.out).parent.mkdir(parents=True, exist_ok=True) - Path(args.out).write_text(generate_markdown_report(runs)) - print(f"Report written to {args.out} ({len(runs)} runs)") - - if args.csv: - with open(args.csv, "w", newline="") as f: - w = csv.DictWriter(f, fieldnames=[ - "model", "gpu", "tp", "pp", "dp", "max_model_len", - "dtype", "quantization", "kv_cache_dtype", - "weight_error_pct", "activation_error_pct", - "non_torch_error_pct", "kv_cache_error_pct"]) - w.writeheader() - for r in runs: - e = r.get("error_pct", {}) - va = r.get("vllm_args", r) - w.writerow({ - "model": r["model"], "gpu": r["gpu"], - "tp": va.get("tensor_parallel_size", r.get("tp")), - "pp": va.get("pipeline_parallel_size", r.get("pp")), - "dp": va.get("data_parallel_size", r.get("dp")), - "max_model_len": va.get("max_model_len", r.get("max_model_len")), - "dtype": va.get("dtype", "auto"), - "quantization": va.get("quantization"), - "kv_cache_dtype": va.get("kv_cache_dtype", "auto"), - "weight_error_pct": e.get("weight_memory"), - "activation_error_pct": e.get("activation_memory"), - "non_torch_error_pct": e.get("non_torch_memory"), - "kv_cache_error_pct": e.get("kv_cache"), - }) - - -if __name__ == "__main__": - main() + +def fv(v: float, d: int = 2) -> str: + return f"{v:.{d}f}" if not math.isnan(v) else "n/a" + + +# ── Data Loading ────────────────────────────────────────────────────────────── + +raw_ok = [r for r in csv.DictReader(RAW_CSV.open()) if r["status"] == "ok"] +pred_all = list(csv.DictReader(PRED_CSV.open())) +assert len(raw_ok) == len(pred_all) +pairs = list(zip(raw_ok, pred_all)) + +# ── Per-row error calculation 
───────────────────────────────────────────────── + +COMPONENTS = { + "weight": ("weight_memory_gib", "pred_weight_memory_gib"), + "activation": ("activation_memory_gib", "pred_activation_memory_gib"), + "non_torch": ("non_torch_forward_gib", "pred_non_torch_gib"), + "cuda_graph": ("cuda_graph_actual_gib", "pred_cuda_graph_gib"), + "total_non_kv": ("total_non_kv_cache_gib", "pred_total_non_kv_cache_gib"), + "kv_cache": ("kv_cache_memory_gib", "pred_kv_cache_memory_gib"), + "kv_tokens": ("kv_cache_tokens", "pred_kv_cache_tokens"), + "max_concurrency": ("max_concurrency", "pred_max_concurrency"), +} + +rows_data = [] +for raw, pred in pairs: + entry = { + "log_file": raw["log_file"], + "model": raw["model"], + "architecture": pred["architecture"], + "gpu": raw["gpu"], + "tp": int(raw["tp"]), + "pp": int(raw["pp"]), + "dp": int(raw["dp"]), + "max_model_len": int(raw["max_model_len"]), + "quantization": raw["quantization"], + "kv_cache_dtype":raw["kv_cache_dtype"], + "dtype": raw["dtype"], + } + for key, (rcol, pcol) in COMPONENTS.items(): + try: + a = float(raw.get(rcol, "")) + p = float(pred.get(pcol, "")) + except (ValueError, TypeError): + a = p = float("nan") + entry[f"actual_{key}"] = a + entry[f"pred_{key}"] = p + entry[f"err_{key}"] = pct_error(a, p) + rows_data.append(entry) + + +# ── Segment helpers ─────────────────────────────────────────────────────────── + +def where(fn): + return [r for r in rows_data if fn(r)] + +base = where(lambda r: r["tp"] == 1 and r["pp"] == 1 and r["max_model_len"] == 8192 + and r["quantization"] in ("None", "", None) and r["kv_cache_dtype"] != "fp8") +multi = where(lambda r: r["tp"] > 1 or r["pp"] > 1) +quant = where(lambda r: r["quantization"] not in ("None", "", None)) +kvfp8 = where(lambda r: r["kv_cache_dtype"] == "fp8") + + +# ── Report builder ──────────────────────────────────────────────────────────── + +lines = [] +W = lines.append + + +def section(title: str, rows: list[dict]): + W(f"\n### {title} (n={len(rows)})\n") + 
W("| Component | Mean error | Median | Mean abs | Min | Max | n |") + W("|-----------|:----------:|:------:|:--------:|:---:|:---:|:-:|") + for key in ["weight", "activation", "non_torch", "cuda_graph", + "total_non_kv", "kv_cache", "max_concurrency"]: + W(stats_row(key.replace("_", " ").title(), [r[f"err_{key}"] for r in rows])) + + +# ═══════════════════════════════════════════════════════════════════════════════ +W("# Capacity Planner Accuracy Report — vLLM v0.19.0 / H100-80GB") +W("") +W(f"**Dataset**: {len(rows_data)} successful runs across " + f"{len(set(r['model'] for r in rows_data))} unique models ") +W("**Hardware**: H100-80GB (catalog memory = 80 GiB, actual = ~79.19 GiB) ") +W("**Planner GPU util**: actual `gpu_memory_utilization` per run (0.95) ") +W("") + +# ── Executive Summary ───────────────────────────────────────────────────────── +W("## Executive Summary\n") + +kv_errs_all = [r["err_kv_cache"] for r in rows_data if not math.isnan(r["err_kv_cache"])] +kv_errs_base = [r["err_kv_cache"] for r in base if not math.isnan(r["err_kv_cache"])] +act_errs = [r["err_activation"] for r in rows_data if not math.isnan(r["err_activation"])] +wt_errs = [r["err_weight"] for r in rows_data if not math.isnan(r["err_weight"])] +nt_errs = [r["err_non_torch"] for r in rows_data if not math.isnan(r["err_non_torch"])] +conc_errs = [r["err_max_concurrency"] for r in rows_data if not math.isnan(r["err_max_concurrency"])] + +kv_mean_all = statistics.mean(kv_errs_all) +kv_abs_all = statistics.mean(abs(e) for e in kv_errs_all) +kv_mean_base = statistics.mean(kv_errs_base) +act_mean = statistics.mean(act_errs) +act_abs = statistics.mean(abs(e) for e in act_errs) +wt_mean = statistics.mean(wt_errs) +wt_abs = statistics.mean(abs(e) for e in wt_errs) +conc_mean = statistics.mean(conc_errs) +conc_abs = statistics.mean(abs(e) for e in conc_errs) + +W("| Metric | Mean error | Mean abs error | Notes |") +W("|--------|:----------:|:--------------:|-------|") +W(f"| **KV Cache 
memory** (all 47 runs) | {fmt(kv_mean_all)} | {fmt(kv_abs_all)} | |") +W(f"| **KV Cache memory** (baseline: tp=pp=1, len=8192, no-quant) | {fmt(kv_mean_base)} | — | n={len(kv_errs_base)} |") +W(f"| **Weight memory** | {fmt(wt_mean)} | {fmt(wt_abs)} | From safetensors metadata |") +W(f"| **Activation memory** | {fmt(act_mean)} | {fmt(act_abs)} | Largest error source |") +W(f"| **Non-torch overhead** | {fmt(statistics.mean(nt_errs))} | {fmt(statistics.mean(abs(e) for e in nt_errs))} | |") +W(f"| **Max concurrency** | {fmt(conc_mean)} | {fmt(conc_abs)} | Proxy for KV cache accuracy |") +W("") +W("### Key Findings\n") +W(f"1. **Weights are accurate** — mean abs error {fmt(wt_abs)}, computed directly from " + "safetensors parameter counts. Errors arise only when `--dtype` overrides the native " + "dtype (e.g., `--dtype float32`) or when quantization is not fully captured in the config.") +W(f"2. **Activation is the dominant error source** — mean {fmt(act_mean)} (over-estimate). " + "The planner uses empirical constants (4.8–8.0 GiB) measured at `max_model_len=16000`; " + "vLLM v0.19.0 reports 0.75–2.2 GiB across all architectures tested. Granite is worst (+600%), " + "Mistral3/Pixtral is best (+15–23%).") +W("3. **Over-estimated activation partially cancels** the catalog GPU memory inflation (+0.77 GiB), " + f"leaving KV cache only {fmt(kv_mean_all)} off on average across all runs. But this is " + "coincidental cancellation of two large opposing errors, not model accuracy.") +W("4. **Non-default KV dtype (`--kv-cache-dtype fp8`) doubles token capacity** but the planner " + "ignores this flag — KV token count is off by ~2× for those runs.") +W("5. **`--dtype float32` breaks weight prediction** — the planner uses the HuggingFace " + "config dtype (BF16) and never sees the vLLM `--dtype` override, giving −50% weight error.") +W("6. 
**Pipeline parallelism reduces actual activation** (each GPU processes fewer layers) " + "but the formula uses the same constant regardless of PP, compounding the activation error.") +W("") + +# ── Component Error Tables ──────────────────────────────────────────────────── +W("## Component-Level Error Breakdown\n") +W("> Percent error = (predicted − actual) / actual × 100. " + "Positive = over-estimate, negative = under-estimate.\n") + +section("All 47 Runs", rows_data) +section("Baseline: TP=1, PP=1, len=8192, no quantization, default KV dtype", base) +section("Multi-GPU (TP > 1 or PP > 1)", multi) +section("Quantized Models (fp8-dynamic / w8a8 / w4a16)", quant) +section("Non-default KV cache dtype (--kv-cache-dtype fp8)", kvfp8) + +# ── Per-Model Error Table ───────────────────────────────────────────────────── +W("\n## Per-Model Errors — Baseline Runs\n") +W("> TP=1, PP=1, max_model_len=8192, no quantization, default KV dtype.\n") +W("| Model | Arch | Weight err | Activation err | Non-torch err | KV cache err | Max conc err |") +W("|-------|------|:----------:|:--------------:|:-------------:|:------------:|:------------:|") +for r in sorted(base, key=lambda x: x["model"]): + model_short = r["model"].split("/")[-1][:35] + arch_short = (r["architecture"] + .replace("ForCausalLM", "") + .replace("ForConditionalGeneration", "*"))[:25] + W(f"| {model_short} | {arch_short} | " + f"{fmt(r['err_weight'])} | {fmt(r['err_activation'])} | " + f"{fmt(r['err_non_torch'])} | {fmt(r['err_kv_cache'])} | " + f"{fmt(r['err_max_concurrency'])} |") + +# ═══════════════════════════════════════════════════════════════════════════════ +W("\n## Argument Sensitivity Analysis\n") +W("> This section examines how each vLLM launch argument affects whether the " + "capacity planner's memory predictions remain accurate.\n") + +# ── max_model_len ───────────────────────────────────────────────────────────── +W("### `--max-model-len` (context window size)\n") + +llama_len = where(lambda r: 
"Llama-3.1-8B-Instruct" in r["model"] + and r["tp"] == 1 and r["pp"] == 1 + and r["quantization"] in ("None", "", None) + and r["kv_cache_dtype"] != "fp8") +llama_len.sort(key=lambda r: r["max_model_len"]) + +qwen_len = where(lambda r: r["model"] == "Qwen/Qwen2.5-7B-Instruct" + and r["tp"] == 1 and r["pp"] == 1 + and r["quantization"] in ("None", "", None) + and r["kv_cache_dtype"] != "fp8") +qwen_len.sort(key=lambda r: r["max_model_len"]) + +W("| Model | max_model_len | Actual KV (GiB) | KV err | Actual tokens | Pred tokens | Token err | Max conc err |") +W("|-------|:-------------:|:---------------:|:------:|:-------------:|:-----------:|:---------:|:------------:|") +for r in llama_len + qwen_len: + model_short = r["model"].split("/")[-1][:28] + W(f"| {model_short} | {r['max_model_len']:,} | " + f"{fv(r['actual_kv_cache'])} | {fmt(r['err_kv_cache'])} | " + f"{int(r['actual_kv_tokens']):,} | {int(r['pred_kv_tokens']):,} | " + f"{fmt(r['err_kv_tokens'])} | {fmt(r['err_max_concurrency'])} |") + +W("") +W("**Conclusion**: `--max-model-len` has **no effect on KV pool size** — the formula and " + "vLLM agree on this. Activation memory is constant (the fixed profiling overhead does not " + "depend on context length), so the KV pool prediction error stays flat at ~−3 to −4% " + "regardless of whether context is 2 K or 32 K tokens. 
The token/concurrency predictions " + "carry that same constant KV error forward, plus any error from the per-token KV formula.") + +# ── TP ──────────────────────────────────────────────────────────────────────── +W("\n### `--tensor-parallel-size` (TP)\n") + +tp_sweep = where(lambda r: r["model"] == "meta-llama/Llama-3.1-8B-Instruct" + and r["max_model_len"] == 8192 + and r["quantization"] in ("None", "", None) + and r["kv_cache_dtype"] != "fp8" + and r["pp"] == 1) +tp_sweep.sort(key=lambda r: r["tp"]) + +qwen_tp = where(lambda r: r["model"] == "Qwen/Qwen2.5-7B-Instruct" + and r["max_model_len"] == 8192 + and r["quantization"] in ("None", "", None) + and r["kv_cache_dtype"] != "fp8" + and r["pp"] == 1) +qwen_tp.sort(key=lambda r: r["tp"]) + +W("| Model | TP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err |") +W("|-------|:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:|") +for r in tp_sweep + qwen_tp: + model_short = r["model"].split("/")[-1][:22] + W(f"| {model_short} | {r['tp']} | " + f"{fv(r['actual_weight'])} | {fmt(r['err_weight'])} | " + f"{fv(r['actual_activation'])} | {fmt(r['err_activation'])} | " + f"{fv(r['actual_non_torch'])} | {fmt(r['err_non_torch'])} | " + f"{fmt(r['err_kv_cache'])} |") + +W("") +W("**Conclusions**:\n") +W("- **Weights scale correctly**: the formula divides by TP, matching vLLM's per-GPU weight sharding. " + "Weight error stays near 0% across TP=1–4.") +W("- **Activation is TP-invariant in both formula and reality**: vLLM's profiling overhead does not " + "shrink with TP (it captures the same set of batch sizes). The formula also keeps activation " + "constant with TP. Error stays flat.") +W("- **Non-torch is heavily under-estimated for TP≥2**: the 0.60 GiB/GPU constant does not capture " + "NCCL all-reduce buffer overhead, which grows with TP. 
Actual non-torch reaches ~2.1 GiB/GPU at " + "TP=4 (3.5× the constant). However, this error is partially masked in KV cache accuracy because " + "the over-estimated activation pulls the prediction in the opposite direction.") + +# ── PP ──────────────────────────────────────────────────────────────────────── +W("\n### `--pipeline-parallel-size` (PP)\n") + +pp_sweep = where(lambda r: r["model"] == "meta-llama/Llama-3.1-8B-Instruct" + and r["max_model_len"] == 8192 + and r["quantization"] in ("None", "", None) + and r["kv_cache_dtype"] != "fp8" + and r["tp"] == 1) +pp_sweep.sort(key=lambda r: r["pp"]) + +W("| PP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err |") +W("|:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:|") +for r in pp_sweep: + W(f"| {r['pp']} | " + f"{fv(r['actual_weight'])} | {fmt(r['err_weight'])} | " + f"{fv(r['actual_activation'])} | {fmt(r['err_activation'])} | " + f"{fv(r['actual_non_torch'])} | {fmt(r['err_non_torch'])} | " + f"{fmt(r['err_kv_cache'])} |") + +# Compute activation values directly for the prose +pp_acts = {r["pp"]: r["actual_activation"] for r in pp_sweep} +pp_preds = {r["pp"]: r["pred_activation"] for r in pp_sweep} +W("") +W("**Conclusions**:\n") +W(f"- **Activation drops sharply with PP**: at PP=1, vLLM profiles {fv(pp_acts.get(1,float('nan')))} GiB " + f"of activation; at PP=2 it drops to {fv(pp_acts.get(2,float('nan')))} GiB; " + f"at PP=4 to {fv(pp_acts.get(4,float('nan')))} GiB. " + "Each pipeline stage runs fewer transformer layers, so the profiling sweep allocates proportionally less. 
" + f"The formula does not account for this and always predicts {fv(pp_preds.get(1,float('nan')))} GiB, " + "making the activation error grow with PP (from ~+154% at PP=1 to ~+357% at PP=4).") +W("- **Non-torch increases with PP** due to inter-stage P2P send/receive buffers, " + "but the formula uses the same TP=1 constant (0.15 GiB/GPU) regardless of PP, " + "causing the non-torch estimate to overshoot actual (predicted > actual for PP>1 because " + "each stage is a separate process and 0.15 is per-GPU). " + "These two errors partially offset each other in the KV cache prediction.") +W("- **Weight error grows with PP**: the formula divides only by TP×PP for weight sharding, " + "but with PP=4, model layers are not uniformly distributed across stages in all cases " + "(irregular last-stage allocation can leave a stage with fewer params).") + +# ── dtype ───────────────────────────────────────────────────────────────────── +W("\n### `--dtype` (compute/storage dtype override)\n") + +dtype_sweep = where(lambda r: "Llama-3.1" in r["model"] + and r["tp"] == 1 and r["pp"] == 1 and r["max_model_len"] == 8192) +dtype_sweep.sort(key=lambda r: (r["dtype"], r["quantization"], r["kv_cache_dtype"])) + +W("| dtype arg | quantization | kv_cache_dtype | Actual weight (GiB) | Weight err | Actual KV (GiB) | KV err |") +W("|-----------|:------------:|:--------------:|:-------------------:|:----------:|:---------------:|:------:|") +for r in dtype_sweep: + W(f"| {r['dtype'].replace('torch.', '')} | {r['quantization']} | " + f"{r['kv_cache_dtype']} | " + f"{fv(r['actual_weight'])} | {fmt(r['err_weight'])} | " + f"{fv(r['actual_kv_cache'])} | {fmt(r['err_kv_cache'])} |") + +W("") +W("**Conclusions**:\n") +W("- **`--dtype float32`** doubles model weight memory (29.98 GiB vs BF16's 14.99 GiB). 
" + "The planner reads the HuggingFace config dtype (BF16) and is unaware of the `--dtype` " + "vLLM override, so it predicts 14.96 GiB — a **−50% weight error**, which cascades into " + "a +31% KV cache over-prediction (the planner thinks there is more room than there is).") +W("- **`--dtype float16`** is handled correctly because the HuggingFace config also stores " + "float16 for these models; weight error stays near 0%.") +W("- **FP8-dynamic quantization** (`fp8` in the quantization column) halves weight memory. " + "The planner reads `quantization_config` from the HuggingFace repo and applies the FP8 " + "byte-per-param, yielding near-zero weight error. KV cache error stays consistent with " + "the activation over-estimation.") +W("- **`--kv-cache-dtype fp8`** does not affect weight or activation predictions, but halves " + "per-token KV storage. The planner ignores this flag and predicts KV tokens ~50% too low " + "(see dedicated section below).") + +# ── quantization ────────────────────────────────────────────────────────────── +W("\n### `--quantization` (weight quantization method)\n") + +quant_rows = where(lambda r: r["quantization"] not in ("None", "", None)) +quant_rows.sort(key=lambda r: (r["quantization"], r["model"])) + +W("| Model | quant method | TP | Actual weight (GiB) | Weight err | Actual KV (GiB) | KV err |") +W("|-------|--------------|----|:-------------------:|:----------:|:---------------:|:------:|") +for r in quant_rows: + model_short = r["model"].split("/")[-1][:30] + W(f"| {model_short} | {r['quantization']} | {r['tp']} | " + f"{fv(r['actual_weight'])} | {fmt(r['err_weight'])} | " + f"{fv(r['actual_kv_cache'])} | {fmt(r['err_kv_cache'])} |") + +W("") +W("**Conclusions**:\n") +W("- **w8a8 (compressed-tensors INT8)**: the planner parses `config_groups` from the " + "`quantization_config` to find `num_bits=8` and applies 1 byte/param. 
# ── kv_cache_dtype ────────────────────────────────────────────────────────────
# Report section comparing each fp8-KV run against its default-KV baseline.
W("\n### `--kv-cache-dtype` (KV cache precision)\n")

kv_dtype_rows = where(lambda r: r["kv_cache_dtype"] == "fp8")
kv_dtype_rows.sort(key=lambda r: r["model"])

# Pair every fp8-KV row with its baseline explicitly.  The previous code
# collected baselines into a separate list (skipping runs with no baseline)
# and then zip()-ed the two lists by position, so one missing baseline
# shifted every later pairing onto the wrong model and zip() silently
# truncated the tail.  Keeping (baseline, fp8) tuples makes misalignment
# impossible; baseline is None when no comparable default-KV run exists.
kv_pairs = []
for kfp8 in kv_dtype_rows:
    # Same model and parallelism, unquantized, non-fp8 KV cache.
    match = where(lambda r, m=kfp8: (r["model"] == m["model"]
                                     and r["tp"] == m["tp"]
                                     and r["pp"] == m["pp"]
                                     and r["max_model_len"] == m["max_model_len"]
                                     and r["kv_cache_dtype"] != "fp8"
                                     and r["quantization"] in ("None", "", None)))
    kv_pairs.append((match[0] if match else None, kfp8))

# Kept for backward compatibility with any later code that reads this name.
kv_default_rows = [baseline for baseline, _ in kv_pairs if baseline is not None]

W("| Model | kv_cache_dtype | Actual KV (GiB) | KV err | Actual tokens | Pred tokens | Token err | Conc err |")
W("|-------|:--------------:|:---------------:|:------:|:-------------:|:-----------:|:---------:|:--------:|")
for baseline, fp8_row in kv_pairs:
    for r in (baseline, fp8_row):
        if r is None:
            # No baseline run for this configuration: show the fp8 row alone.
            continue
        model_short = r["model"].split("/")[-1][:28]
        W(f"| {model_short} | {r['kv_cache_dtype']} | "
          f"{fv(r['actual_kv_cache'])} | {fmt(r['err_kv_cache'])} | "
          f"{int(r['actual_kv_tokens']):,} | {int(r['pred_kv_tokens']):,} | "
          f"{fmt(r['err_kv_tokens'])} | {fmt(r['err_max_concurrency'])} |")
    # Empty row visually separates one model's pair from the next.
    W("|||||||||")

W("")
W("**Conclusion**: `--kv-cache-dtype fp8` stores each KV element in 1 byte instead of 2 bytes "
  "(BF16/FP16), doubling the number of tokens that fit in the KV pool. The KV pool size in GiB "
  "is unaffected (same activation and weight overhead), so the **KV GiB error stays near −4%** "
  "(the same as the default-dtype baseline). But because the planner always computes per-token "
  "bytes from the model's native compute dtype, **token count and max-concurrency predictions "
  "are ~52% too low** for fp8-KV runs. This is a direct, fixable bug: the planner should accept "
  "`kv_cache_dtype` as an input parameter and apply 1 byte/token when it is `fp8`.")
" + "vLLM v0.19.0 reports substantially lower values during its profiling phase:\n") +W("| Architecture | Predicted (GiB) | Observed range (GiB) | Error range |") +W("|-------------|:---------------:|:--------------------:|:-----------:|") +archs_seen: dict[str, list] = {} +for r in rows_data: + arch = r["architecture"] + if not math.isnan(r["err_activation"]): + archs_seen.setdefault(arch, []).append( + (r["actual_activation"], r["pred_activation"], r["err_activation"])) +for arch, data in sorted(archs_seen.items()): + acts = [d[0] for d in data] + preds = [d[1] for d in data] + errs = [d[2] for d in data] + arch_label = (arch.replace("ForCausalLM", "") + .replace("ForConditionalGeneration", "*"))[:35] + W(f"| {arch_label} | {fv(statistics.mean(preds))} | " + f"{fv(min(acts))}–{fv(max(acts))} | " + f"{fmt(min(errs))} to {fmt(max(errs))} |") +W("") +W("The discrepancy suggests the constants were measured with an older vLLM version or " + "different compilation settings. Re-calibrating to these v0.19.0 measurements would be " + "the highest-value fix.") + +W("\n### 2. Non-torch Memory — Underestimated for Multi-GPU\n") +W("| TP | PP | Constant used | Actual mean (GiB) | Mean error |") +W("|:--:|:--:|:-------------:|:-----------------:|:----------:|") +for tp_v, pp_v in [(1,1),(1,2),(1,4),(2,1),(4,1)]: + grp = where(lambda r, t=tp_v, p=pp_v: r["tp"]==t and r["pp"]==p) + if not grp: + continue + const = 0.15 if tp_v == 1 else 0.60 + acts = [r["actual_non_torch"] for r in grp if not math.isnan(r["actual_non_torch"])] + errs = [r["err_non_torch"] for r in grp if not math.isnan(r["err_non_torch"])] + if not acts: + continue + W(f"| {tp_v} | {pp_v} | {const} GiB | {fv(statistics.mean(acts))} | " + f"{fmt(statistics.mean(errs))} |") + +W("\nFor TP=1 the formula slightly under-estimates (0.15 vs ~0.25 GiB actual). " + "For TP≥2, NCCL all-reduce buffers push actual non-torch to ~2.1 GiB — 3.5× " + "the 0.60 GiB constant. 
For PP≥2, P2P send/receive adds overhead that the formula " + "doesn't model at all.") + +W("\n### 3. GPU Memory Catalog vs Physical\n") +W("The planner uses 80 GiB (catalog) but H100 physical VRAM is 79.19 GiB:\n") +W("- Catalog available: 80 × 0.95 = **76.00 GiB**") +W("- Physical available: 79.19 × 0.95 = **75.23 GiB**") +W("- Systematic KV over-prediction from this source alone: **+0.77 GiB**") + +W("\n### 4. CUDA Graph Memory — Excluded from Formula\n") +cg_vals = [r["actual_cuda_graph"] for r in rows_data + if not math.isnan(r["actual_cuda_graph"]) and r["actual_cuda_graph"] > 0] +W("The planner returns 0.0 GiB for CUDA graphs (treating it as included in activation). " + "vLLM allocates the CUDA graph pool *after* sizing the KV cache, so the reported " + "KV pool includes CUDA graph memory. The formula is therefore consistent with the " + "log-reported KV number — no fix needed, but it should be documented.") +if cg_vals: + W(f"\nObserved CUDA graph pool sizes: {fv(min(cg_vals))}–{fv(max(cg_vals))} GiB " + f"(mean {fv(statistics.mean(cg_vals))} GiB).") + +# ── Recommendations ─────────────────────────────────────────────────────────── +W("\n## Recommendations\n") +W("| Priority | Fix | Expected impact |") +W("|:--------:|-----|:---------------:|") +W("| 🔴 High | **Re-calibrate activation constants** from v0.19.0 measurements. " + "Current constants are 2–7× too high. Updating to ~1.0–2.2 GiB/architecture would " + "remove the single largest prediction error. | +4–10 GiB KV accuracy |") +W("| 🔴 High | **Accept `--kv-cache-dtype` as a planner input.** When set to `fp8`, " + "halve the per-token KV bytes. This is a one-line formula change. " + "| 2× token/concurrency accuracy for fp8-KV runs |") +W("| 🔴 High | **Accept `--dtype` as a planner input.** When set to `float32`, " + "double the per-param bytes for weight estimation. 
" + "| Fixes −50% weight error for float32 runs |") +W("| 🟡 Medium | **Re-measure non-torch constants for TP≥2 and PP≥2.** " + "NCCL overhead scales with both and is currently under-estimated by ~3.5×. " + "| +1–2 GiB KV accuracy for multi-GPU |") +W("| 🟡 Medium | **Scale activation constant by 1/PP.** " + "Each pipeline stage processes layers/PP transformer blocks; " + "profiling overhead scales proportionally. " + "| Fixes growing activation error at high PP |") +W("| 🟢 Low | **Use physical GPU memory** (79.19 GiB for H100) rather than " + "the catalog 80 GiB nominal. | +0.77 GiB KV accuracy |") + +report = "\n".join(lines) +OUT_MD.write_text(report) +print(f"Report written → {OUT_MD}") +print(f"\n{'─'*60}") +print("HEADLINE NUMBERS") +print(f"{'─'*60}") +print(f" KV cache mean error (all): {fmt(kv_mean_all)}") +print(f" KV cache mean error (baseline): {fmt(kv_mean_base)}") +print(f" Weights mean abs error: {fmt(wt_abs)}") +print(f" Activation mean error: {fmt(act_mean)}") +print(f" Activation mean abs error: {fmt(act_abs)}") +print(f" Max concurrency mean error: {fmt(conc_mean)}") +print(f" Max concurrency mean abs error: {fmt(conc_abs)}") diff --git a/accuracy/scripts/deep_analysis.py b/accuracy/scripts/deep_analysis.py deleted file mode 100644 index 985d1365..00000000 --- a/accuracy/scripts/deep_analysis.py +++ /dev/null @@ -1,397 +0,0 @@ -""" -Deep percent-error analysis of capacity-planner predictions vs vLLM measurements. - -Reads results.csv produced by analyze.py and writes a detailed markdown report -broken down by model, model family, TP degree, and quantization. 
# Columns of results.csv that are converted from text on load.
_INT_FIELDS = ("tp", "pp", "dp", "max_model_len")
_FLOAT_FIELDS = (
    "weight_error_pct",
    "activation_error_pct",
    "non_torch_error_pct",
    "kv_cache_error_pct",
)


def load_csv(path: str) -> list[dict]:
    """Read the results CSV and convert its numeric columns.

    Args:
        path: Location of the CSV file.

    Returns:
        One dict per CSV row.  The parallelism/context columns are converted
        to ``int`` when non-empty (an empty cell is left as ``""``); the
        ``*_error_pct`` columns become ``float``, or ``None`` when empty.

    Raises:
        KeyError: If one of the expected columns is missing from the file.
        ValueError: If a non-empty numeric cell cannot be parsed.
    """
    rows: list[dict] = []
    # The context manager closes the handle (the old bare open() leaked it);
    # newline="" is what the csv module requires so that quoted fields with
    # embedded newlines are parsed correctly.
    with open(path, newline="", encoding="utf-8") as handle:
        for record in csv.DictReader(handle):
            for name in _INT_FIELDS:
                if record[name]:
                    record[name] = int(record[name])
            for name in _FLOAT_FIELDS:
                record[name] = float(record[name]) if record[name] else None
            rows.append(record)
    return rows


# Ordered (substrings, label) rules: the first rule with any substring present
# in the lower-cased model id wins, so specific entries must precede generic
# ones (e.g. "llama-3.3" before "llama", "kimi-vl" before "kimi").
_FAMILY_RULES = (
    (("llama-4",), "Llama-4"),
    (("llama-3.3",), "Llama-3.3"),
    (("llama-3.1", "llama-3-1"), "Llama-3.1"),
    (("llama",), "Llama (other)"),
    (("qwen3",), "Qwen3"),
    (("qwen2.5", "qwen2-5"), "Qwen2.5"),
    (("qwen2",), "Qwen2"),
    (("qwen",), "Qwen (other)"),
    (("deepseek",), "DeepSeek"),
    (("mistral-small",), "Mistral-Small"),
    (("mixtral",), "Mixtral"),
    (("phi",), "Phi"),
    (("granite-vision",), "Granite-Vision"),
    (("granite",), "Granite"),
    (("kimi-vl",), "Kimi-VL"),
    (("kimi",), "Kimi"),
    (("gpt-oss",), "GPT-OSS (openai)"),
)

# Same first-match-wins convention as _FAMILY_RULES.
_ARCH_RULES = (
    (("mixtral", "qwen3-30b-a3b", "kimi-dev", "gpt-oss",
      "deepseek-v2", "llama-4-scout"), "MoE"),
    (("granite-vision", "kimi-vl"), "Multimodal"),
)


def family(model: str) -> str:
    """Coarse family label for grouping.

    Matching is case-insensitive substring search over the model id.  When no
    rule matches, the part of ``model`` before the first ``/`` is returned
    (original casing preserved).
    """
    lowered = model.lower()
    for needles, label in _FAMILY_RULES:
        if any(needle in lowered for needle in needles):
            return label
    return model.split("/")[0]


def arch_type(model: str) -> str:
    """Classify a model id as ``"MoE"``, ``"Multimodal"`` or ``"Dense"``.

    The MoE list is checked first, so a model matching both lists is reported
    as MoE; anything matching neither list is ``"Dense"``.
    """
    lowered = model.lower()
    for needles, label in _ARCH_RULES:
        if any(needle in lowered for needle in needles):
            return label
    return "Dense"
Stats helpers -# --------------------------------------------------------------------------- - -def _stats(values: list[float]) -> dict: - if not values: - return {} - return { - "n": len(values), - "mean": statistics.mean(values), - "median": statistics.median(values), - "stdev": statistics.stdev(values) if len(values) > 1 else 0.0, - "min": min(values), - "max": max(values), - "mae": statistics.mean(abs(v) for v in values), - "within5": sum(1 for v in values if abs(v) <= 5) / len(values) * 100, - "within10": sum(1 for v in values if abs(v) <= 10) / len(values) * 100, - } - - -def fmt_pct(v: float | None, decimals: int = 1) -> str: - if v is None or (isinstance(v, float) and math.isnan(v)): - return "—" - sign = "+" if v > 0 else "" - return f"{sign}{v:.{decimals}f}%" - - -def fmt_stat_row(label: str, s: dict, field: str = "weight") -> str: - if not s: - return f"| {label} | — | — | — | — | — | — |" - return ( - f"| {label} | {s['n']} " - f"| {fmt_pct(s['mean'])} " - f"| {fmt_pct(s['mae'])} " - f"| {fmt_pct(s['min'])} / {fmt_pct(s['max'])} " - f"| {s['within5']:.0f}% " - f"| {s['within10']:.0f}% |" - ) - - -STAT_HEADER = ( - "| Cohort | N | Mean err | MAE | Min / Max | ≤5% | ≤10% |", - "|---|---|---|---|---|---|---|", -) - - -# --------------------------------------------------------------------------- -# Report sections -# --------------------------------------------------------------------------- - -def section_executive_summary(rows: list[dict]) -> list[str]: - w_vals = [r["weight_error_pct"] for r in rows if r["weight_error_pct"] is not None] - k_vals = [r["kv_cache_error_pct"] for r in rows if r["kv_cache_error_pct"] is not None] - ws = _stats(w_vals) - ks = _stats(k_vals) - - lines = [ - "## Executive Summary\n", - f"**Runs analyzed**: {len(rows)} across {len({r['model'] for r in rows})} models " - f"on {len({r['gpu'] for r in rows})} GPU type(s).\n", - "### Overall accuracy\n", - *STAT_HEADER, - fmt_stat_row("Weight memory", ws), - fmt_stat_row("KV cache 
memory", ks), - "", - ] - return lines - - -def section_per_model(rows: list[dict]) -> list[str]: - lines = ["## Per-model breakdown\n"] - by_model = defaultdict(list) - for r in rows: - by_model[r["model"]].append(r) - - for model in sorted(by_model): - mrs = by_model[model] - fam = family(model) - atype = arch_type(model) - lines.append(f"### {model} _{fam} · {atype}_\n") - lines += [ - "| TP | PP | DP | max_len | dtype | quant | kv_dtype " - "| Weight err | KV err |", - "|---|---|---|---|---|---|---|---|---|", - ] - for r in sorted(mrs, key=lambda x: (x["tp"], x["pp"], x["max_model_len"])): - lines.append( - f"| {r['tp']} | {r['pp']} | {r['dp']} | {r['max_model_len']} " - f"| {r['dtype'] or 'auto'} " - f"| {r['quantization'] or '—'} " - f"| {r['kv_cache_dtype'] or 'auto'} " - f"| {fmt_pct(r['weight_error_pct'])} " - f"| {fmt_pct(r['kv_cache_error_pct'])} |" - ) - lines.append("") - return lines - - -def section_per_family(rows: list[dict]) -> list[str]: - lines = ["## Per-model-family accuracy\n", *STAT_HEADER] - by_fam: dict[str, list] = defaultdict(list) - for r in rows: - by_fam[family(r["model"])].append(r) - - for fam in sorted(by_fam): - frows = by_fam[fam] - w_vals = [r["weight_error_pct"] for r in frows if r["weight_error_pct"] is not None] - k_vals = [r["kv_cache_error_pct"] for r in frows if r["kv_cache_error_pct"] is not None] - lines.append(fmt_stat_row(f"**{fam}** — weight", _stats(w_vals))) - lines.append(fmt_stat_row(f"**{fam}** — KV", _stats(k_vals))) - lines.append("") - return lines - - -def section_by_arch_type(rows: list[dict]) -> list[str]: - lines = ["## By architecture type\n", *STAT_HEADER] - by_type: dict[str, list] = defaultdict(list) - for r in rows: - by_type[arch_type(r["model"])].append(r) - - for atype in ("Dense", "MoE", "Multimodal"): - trows = by_type.get(atype, []) - if not trows: - continue - w_vals = [r["weight_error_pct"] for r in trows if r["weight_error_pct"] is not None] - k_vals = [r["kv_cache_error_pct"] for r in trows 
if r["kv_cache_error_pct"] is not None] - lines.append(fmt_stat_row(f"**{atype}** — weight", _stats(w_vals))) - lines.append(fmt_stat_row(f"**{atype}** — KV", _stats(k_vals))) - lines.append("") - return lines - - -def section_tp_sensitivity(rows: list[dict]) -> list[str]: - lines = [ - "## TP sensitivity\n", - "_KV cache error grouped by tensor-parallel degree (all models). " - "After applying the per-GPU normalisation (÷TP×PP)._\n", - *STAT_HEADER, - ] - by_tp: dict[int, list] = defaultdict(list) - for r in rows: - if r["kv_cache_error_pct"] is not None: - by_tp[r["tp"]].append(r["kv_cache_error_pct"]) - for tp in sorted(by_tp): - lines.append(fmt_stat_row(f"TP={tp}", _stats(by_tp[tp]))) - lines.append("") - - lines += [ - "## PP sensitivity\n", - "_KV cache error grouped by pipeline-parallel degree._\n", - *STAT_HEADER, - ] - by_pp: dict[int, list] = defaultdict(list) - for r in rows: - if r["kv_cache_error_pct"] is not None: - by_pp[r["pp"]].append(r["kv_cache_error_pct"]) - for pp in sorted(by_pp): - lines.append(fmt_stat_row(f"PP={pp}", _stats(by_pp[pp]))) - lines.append("") - return lines - - -def section_context_len_sensitivity(rows: list[dict]) -> list[str]: - # Only include runs from models that were tested across multiple context lengths - tested = defaultdict(list) - for r in rows: - if r["tp"] == 1 and r["pp"] == 1: - tested[r["model"]].append(r) - multi = {m: rs for m, rs in tested.items() if len({r["max_model_len"] for r in rs}) > 1} - - if not multi: - return [] - - lines = [ - "## Context-length sensitivity (TP=1 runs only)\n", - "_Models tested at multiple max_model_len values. 
" - "KV cache error should be constant if the formula is context-length-agnostic._\n", - ] - for model in sorted(multi): - lines.append(f"**{model}**\n") - lines += ["| max_len | KV err |", "|---|---|"] - for r in sorted(multi[model], key=lambda x: x["max_model_len"]): - lines.append(f"| {r['max_model_len']} | {fmt_pct(r['kv_cache_error_pct'])} |") - lines.append("") - return lines - - -def section_quantization(rows: list[dict]) -> list[str]: - quant_rows = [r for r in rows if r["quantization"]] - if not quant_rows: - return [] - lines = [ - "## Quantization\n", - "| Model | Quant | TP | Weight err | KV err |", - "|---|---|---|---|---|", - ] - for r in sorted(quant_rows, key=lambda x: (x["model"], x["quantization"], x["tp"])): - lines.append( - f"| {r['model']} | {r['quantization']} | {r['tp']} " - f"| {fmt_pct(r['weight_error_pct'])} | {fmt_pct(r['kv_cache_error_pct'])} |" - ) - lines.append("") - return lines - - -def section_outliers(rows: list[dict], threshold: float = 10.0) -> list[str]: - outliers = [ - r for r in rows - if (r["weight_error_pct"] is not None and abs(r["weight_error_pct"]) > threshold) - or (r["kv_cache_error_pct"] is not None and abs(r["kv_cache_error_pct"]) > threshold) - ] - lines = [f"## Outliers (|error| > {threshold:.0f}%)\n"] - if not outliers: - lines.append(f"_No outliers exceeding ±{threshold:.0f}%._\n") - return lines - - lines += [ - "| Model | TP | PP | Weight err | KV err | Likely cause |", - "|---|---|---|---|---|---|", - ] - for r in sorted(outliers, key=lambda x: abs(x["kv_cache_error_pct"] or 0), reverse=True): - we = r["weight_error_pct"] - ke = r["kv_cache_error_pct"] - m = r["model"].lower() - - cause = "unknown" - if ke is not None and ke < -20: - if "70b" in m or "72b" in m: - cause = "large model: activation constant may underestimate real overhead" - elif "30b" in m and "moe" in arch_type(r["model"]).lower() or "a3b" in m: - cause = "MoE: routing overhead not modeled in activation/KV budget" - else: - cause = "overhead 
underestimated; check activation/non-torch constants" - elif ke is not None and ke > 20: - if r["tp"] >= 2 or r["pp"] >= 2: - cause = "TP/PP residual: per-GPU normalisation may be imprecise" - else: - cause = "KV formula overestimates available budget" - if we is not None and abs(we) > 10: - if r["pp"] >= 4: - cause = "PP≥4: weight sharding formula incorrect for high PP" - elif "moe" in arch_type(r["model"]).lower() or "gpt-oss" in m or "llama-4" in m: - cause = "MoE/sparse model: shared expert / embedding memory not sharded by TP" - - lines.append( - f"| {r['model']} | {r['tp']} | {r['pp']} " - f"| {fmt_pct(we)} | {fmt_pct(ke)} | {cause} |" - ) - lines.append("") - return lines - - -def section_calibration_notes(rows: list[dict]) -> list[str]: - w_vals = [r["weight_error_pct"] for r in rows if r["weight_error_pct"] is not None] - k_vals = [r["kv_cache_error_pct"] for r in rows if r["kv_cache_error_pct"] is not None] - - tp1 = [r for r in rows if r["tp"] == 1 and r["pp"] == 1] - k_tp1 = [r["kv_cache_error_pct"] for r in tp1 if r["kv_cache_error_pct"] is not None] - - lines = [ - "## Calibration notes\n", - "### Weight memory\n", - f"- Mean error {fmt_pct(statistics.mean(w_vals) if w_vals else None)} — " - "slightly negative (planner underestimates). " - "Cause: safetensors metadata reports storage dtype; " - "actual in-memory size can differ due to alignment/padding.\n", - "- PP≥4 and certain MoE models show >10% weight error — " - "embedding and shared-expert tensors may not be sharded by TP/PP " - "as assumed by the formula.\n", - "### KV cache memory (TP=1)\n", - f"- TP=1 KV mean error {fmt_pct(statistics.mean(k_tp1) if k_tp1 else None)} " - f"(MAE {fmt_pct(statistics.mean(abs(v) for v in k_tp1) if k_tp1 else None)}). 
" - "Mostly within ±10%.\n", - "- Consistent negative bias across TP=1 configs suggests activation_memory " - "constant is slightly too high (over-reserves budget, leaving less for KV).\n", - "### KV cache memory (TP>1)\n", - "- After ÷(TP×PP) normalisation, errors are within ±10% for most models.\n", - "- Remaining positive bias at TP=2/4 is consistent with extra NCCL/all-gather " - "buffers not captured by non_torch constant.\n", - "### Large-model KV outliers\n", - "- `Qwen3-30B-A3B` (TP=1): −29%. MoE routing buffers consume more memory than modeled.\n", - "- `Llama-3.3-70B-w8a8` (TP=1): −33%. W8A8 quantization increases activation-memory " - "footprint (dequant workspace) not accounted for in constant.\n", - "- `Kimi-Dev-72B` (TP=2): +62%. Likely residual normalisation issue or " - "model-specific memory layout.\n", - "- `Qwen2.5-72B` (TP=2): +61%. Same pattern as Kimi-Dev-72B — " - "large model at TP=2 still shows excess after normalisation.\n", - ] - return lines - - -# --------------------------------------------------------------------------- -# Main -# --------------------------------------------------------------------------- - -def generate_report(rows: list[dict]) -> str: - parts: list[list[str]] = [ - ["# Capacity Planner — Deep Accuracy Analysis\n", - f"_vLLM v0.19.0 · H100-80GB · {len(rows)} runs · " - f"{len({r['model'] for r in rows})} models_\n"], - section_executive_summary(rows), - section_by_arch_type(rows), - section_per_family(rows), - section_tp_sensitivity(rows), - section_context_len_sensitivity(rows), - section_quantization(rows), - section_outliers(rows), - section_calibration_notes(rows), - section_per_model(rows), - ] - return "\n".join(line for section in parts for line in section) - - -def main() -> None: - ap = argparse.ArgumentParser() - ap.add_argument("--csv", required=True) - ap.add_argument("--out", required=True) - args = ap.parse_args() - - rows = load_csv(args.csv) - report = generate_report(rows) - 
Path(args.out).parent.mkdir(parents=True, exist_ok=True) - Path(args.out).write_text(report) - print(f"Deep analysis written to {args.out} ({len(rows)} rows)") - - -if __name__ == "__main__": - main() diff --git a/accuracy/scripts/parse_log.py b/accuracy/scripts/parse_log.py index f96cb0ca..247031d8 100644 --- a/accuracy/scripts/parse_log.py +++ b/accuracy/scripts/parse_log.py @@ -37,6 +37,17 @@ "vllm_commit": r"\(commit:\s*([0-9a-f]+)\)", } +# vLLM v0.19 memory profiling line (optional — only present in DEBUG logs). +# Multiple workers emit identical values; we take the first match. +# Example: "Total non KV cache memory: 17.12GiB; torch peak memory increase: 1.89GiB; +# non-torch forward increase memory: 0.25GiB; weights memory: 14.99GiB." +_PROFILING_PATTERN = re.compile( + r"Total non KV cache memory:\s*([\d.]+)GiB;" + r"\s*torch peak memory increase:\s*([\d.]+)GiB;" + r"\s*non-torch forward increase memory:\s*([\d.]+)GiB;" + r"\s*weights memory:\s*([\d.]+)GiB" +) + _VLLM_BLOCK_SIZE = 16 # tokens per KV block, constant in vLLM v0.19.0 @@ -72,6 +83,15 @@ def parse(log_path: str | Path) -> dict[str, Any]: m = re.search(pattern, text) result[field] = m.group(1) if m else None + # Memory profiling breakdown (vLLM v0.19 DEBUG line, optional) + mp = _PROFILING_PATTERN.search(text) + if mp: + result["total_non_kv_memory_gib"] = float(mp.group(1)) + result["activation_memory_gib"] = float(mp.group(2)) + result["non_torch_forward_memory_gib"] = float(mp.group(3)) + # weights_memory from profiling line should match weight_memory_gib; keep for cross-check + result["profiling_weights_memory_gib"] = float(mp.group(4)) + # Derived fields if "kv_cache_tokens" in result: result["kv_cache_blocks"] = result["kv_cache_tokens"] // _VLLM_BLOCK_SIZE diff --git a/accuracy/scripts/parse_logs.py b/accuracy/scripts/parse_logs.py new file mode 100644 index 00000000..fde91cfb --- /dev/null +++ b/accuracy/scripts/parse_logs.py @@ -0,0 +1,206 @@ +#!/usr/bin/env python3 +"""Parse vLLM 
def parse_log(log_path: Path) -> dict:
    """Extract configuration and memory metrics from one vLLM startup log.

    Every metric is optional: a key is present in the result only when its
    log line was found.  Each pattern uses ``re.search``, so when several
    workers emit the same line the first occurrence in the file wins.

    Args:
        log_path: Path to the log file.  The GPU type is taken from a
            ``--<name>-<N>gb--`` segment of the file name.

    Returns:
        Dict that always contains ``log_file``, ``status`` (``"ok"``) and
        ``gpu`` (``""`` when the file name carries no GPU segment), plus any
        metrics that were found.
    """
    # Explicit UTF-8 keeps parsing independent of the host locale;
    # errors="replace" means stray bytes cannot abort a run.
    text = log_path.read_text(encoding="utf-8", errors="replace")

    row = {"log_file": log_path.name, "status": "ok"}

    # ── Config: Initializing a V1 LLM engine line ──────────────────────────
    m = re.search(
        r"Initializing a V1 LLM engine.*?model='([^']+)'.*?"
        r"dtype=([^,]+).*?max_seq_len=(\d+).*?"
        r"tensor_parallel_size=(\d+).*?pipeline_parallel_size=(\d+).*?"
        r"data_parallel_size=(\d+).*?quantization=([^,]+).*?"
        r"kv_cache_dtype=([^,\)]+)",
        text,
    )
    if m:
        row["model"] = m.group(1)
        row["dtype"] = m.group(2).strip()
        row["max_model_len"] = int(m.group(3))
        row["tp"] = int(m.group(4))
        row["pp"] = int(m.group(5))
        row["dp"] = int(m.group(6))
        row["quantization"] = m.group(7).strip()
        row["kv_cache_dtype"] = m.group(8).strip()

    # ── Config: gpu_memory_utilization from non-default args ───────────────
    # Falls back to the "Desired GPU memory utilization" line when the
    # argument dump does not mention it.
    m = re.search(r"gpu_memory_utilization':\s*([0-9.]+)", text)
    if not m:
        m = re.search(r"Desired GPU memory utilization is \(([0-9.]+),", text)
    if m:
        row["gpu_memory_utilization"] = float(m.group(1))

    # ── GPU type from filename (e.g. h100-80gb) ────────────────────────────
    gpu_m = re.search(r"--([a-z0-9]+-\d+gb)--", log_path.name, re.I)
    row["gpu"] = gpu_m.group(1).upper() if gpu_m else ""

    # ── Worker init memory snapshot (take first occurrence) ───────────────
    # Groups 1 (torch_peak) and 5 (torch_memory) are matched but not recorded.
    m = re.search(
        r"worker init memory snapshot: torch_peak=([0-9.]+)GiB, "
        r"free_memory=([0-9.]+)GiB, total_memory=([0-9.]+)GiB, "
        r"cuda_memory=([0-9.]+)GiB, torch_memory=([0-9.]+)GiB, "
        r"non_torch_memory=([0-9.]+)GiB",
        text,
    )
    if m:
        row["init_free_memory_gib"] = float(m.group(2))
        row["init_total_memory_gib"] = float(m.group(3))
        row["init_cuda_memory_gib"] = float(m.group(4))
        row["init_non_torch_memory_gib"] = float(m.group(6))

    # ── Worker requested memory (take first) ──────────────────────────────
    m = re.search(r"worker requested memory: ([0-9.]+)GiB", text)
    if m:
        row["requested_memory_gib"] = float(m.group(1))

    # ── Model loading: weight memory ──────────────────────────────────────
    m = re.search(r"Model loading took ([0-9.]+) GiB memory", text)
    if m:
        row["weight_memory_gib"] = float(m.group(1))

    # ── Memory profiling breakdown (take first occurrence = TP0 or solo) ──
    m = re.search(
        r"Memory profiling takes [0-9.]+ seconds\. "
        r"Total non KV cache memory: ([0-9.]+)GiB; "
        r"torch peak memory increase: ([0-9.]+)GiB; "
        r"non-torch forward increase memory: ([0-9.]+)GiB; "
        r"weights memory: ([0-9.]+)GiB\.",
        text,
    )
    if m:
        row["total_non_kv_cache_gib"] = float(m.group(1))
        row["activation_memory_gib"] = float(m.group(2))
        row["non_torch_forward_gib"] = float(m.group(3))
        row["weights_memory_gib"] = float(m.group(4))

    # ── Estimated CUDA graph memory (take first) ──────────────────────────
    m = re.search(r"Estimated CUDA graph memory: ([0-9.]+) GiB total", text)
    if m:
        row["cuda_graph_estimated_gib"] = float(m.group(1))

    # ── Actual CUDA graph pool (take first) ───────────────────────────────
    m = re.search(
        r"CUDA graph pool memory: ([0-9.]+) GiB \(actual\), ([0-9.]+) GiB \(estimated\)",
        text,
    )
    if m:
        row["cuda_graph_actual_gib"] = float(m.group(1))

    # ── Available KV cache memory (take first = Worker_TP0 or solo) ───────
    m = re.search(r"Available KV cache memory: ([0-9.]+) GiB", text)
    if m:
        row["kv_cache_memory_gib"] = float(m.group(1))

    # ── GPU KV cache tokens (EngineCore, single line) ─────────────────────
    m = re.search(r"GPU KV cache size: ([\d,]+) tokens", text)
    if m:
        row["kv_cache_tokens"] = int(m.group(1).replace(",", ""))

    # ── Max concurrency ───────────────────────────────────────────────────
    m = re.search(
        r"Maximum concurrency for ([\d,]+) tokens per request: ([0-9.]+)x", text
    )
    if m:
        row["max_concurrency"] = float(m.group(2))

    # ── KV cache blocks (from metrics log line) ───────────────────────────
    m = re.search(r"num_gpu_blocks is: (\d+)", text)
    if m:
        row["kv_cache_blocks"] = int(m.group(1))

    # ── Free memory on device summary (take first) ────────────────────────
    m = re.search(
        r"Free memory on device \(([0-9.]+)/([0-9.]+) GiB\)", text
    )
    if m:
        row["summary_free_gib"] = float(m.group(1))
        row["summary_total_gib"] = float(m.group(2))

    # ── PIECEWISE / FULL CUDA graph capture counts ────────────────────────
    m = re.search(
        r"Profiling CUDA graph memory: PIECEWISE=(\d+) \(largest=(\d+)\), FULL=(\d+)",
        text,
    )
    if m:
        row["cudagraph_piecewise_count"] = int(m.group(1))
        row["cudagraph_piecewise_largest"] = int(m.group(2))
        row["cudagraph_full_count"] = int(m.group(3))

    return row


# Output column order for results_raw.csv.  Keys missing from a parsed row are
# written as empty cells; keys not listed here are dropped by the writer.
COLUMNS = [
    "log_file",
    "status",
    "model",
    "gpu",
    "tp",
    "pp",
    "dp",
    "max_model_len",
    "dtype",
    "quantization",
    "kv_cache_dtype",
    "gpu_memory_utilization",
    # initial GPU state
    "init_free_memory_gib",
    "init_total_memory_gib",
    "init_cuda_memory_gib",
    "init_non_torch_memory_gib",
    "requested_memory_gib",
    # per-GPU memory breakdown
    "weight_memory_gib",
    "weights_memory_gib",  # from profiling line (should match)
    "activation_memory_gib",
    "non_torch_forward_gib",
    "total_non_kv_cache_gib",
    "cuda_graph_estimated_gib",
    "cuda_graph_actual_gib",
    "kv_cache_memory_gib",
    # KV cache sizing
    "kv_cache_tokens",
    "kv_cache_blocks",
    "max_concurrency",
    # summary
    "summary_free_gib",
    "summary_total_gib",
    "cudagraph_piecewise_count",
    "cudagraph_piecewise_largest",
    "cudagraph_full_count",
]


def main():
    """Parse every ``*.log`` under results/v0.19.0/logs into results_raw.csv.

    Both paths are resolved relative to this script.  Logs whose name ends in
    ``.FAILED.log`` are still parsed, but their ``status`` is set to
    ``"failed"``.
    """
    results_dir = Path(__file__).parent.parent / "results/v0.19.0"
    logs_dir = results_dir / "logs"
    out_path = results_dir / "results_raw.csv"

    log_files = sorted(logs_dir.glob("*.log"))
    print(f"Found {len(log_files)} log files (including .FAILED.log)")

    rows = []
    for lf in log_files:
        row = parse_log(lf)
        if lf.name.endswith(".FAILED.log"):
            row["status"] = "failed"
        rows.append(row)
        print(f"  {'✓' if row['status'] == 'ok' else '✗'} {lf.name}")

    # Explicit encoding so the CSV is identical regardless of host locale.
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=COLUMNS, extrasaction="ignore")
        writer.writeheader()
        writer.writerows(rows)

    print(f"\nWrote {len(rows)} rows → {out_path}")


if __name__ == "__main__":
    main()
new file mode 100644 index 00000000..3ab0da45 --- /dev/null +++ b/accuracy/scripts/predict_capacity.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python3 +""" +Generate capacity planner predictions for configs in results_raw.csv. + +Calls the capacity_planner module with each row's model/config and outputs +the formula-based predictions for direct comparison with actual log values. +""" + +import csv +import os +import sys +import traceback +from pathlib import Path + +# Add src/ to path so we can import planner modules +REPO_ROOT = Path(__file__).parent.parent.parent +sys.path.insert(0, str(REPO_ROOT / "src")) + +from planner.capacity_planner import ( + KVCacheDetail, + allocatable_kv_cache_memory, + estimate_vllm_activation_memory, + estimate_vllm_cuda_graph_memory, + estimate_vllm_non_torch_memory, + get_model_config_from_hf, + model_memory_req, + per_gpu_model_memory_required, +) + +GPU_MEMORY_GIB = 80 # H100 catalog value +HF_TOKEN = os.environ.get("HF_TOKEN") or None + + +def predict(row: dict) -> dict: + model = row["model"] + tp = int(row["tp"]) + pp = int(row["pp"]) + dp = int(row["dp"]) + max_model_len = int(row["max_model_len"]) + gpu_util = float(row["gpu_memory_utilization"]) + + model_config = get_model_config_from_hf(model, HF_TOKEN) + + # Weight memory + total_weight_gib = model_memory_req(model, model_config, HF_TOKEN) + per_gpu_weight_gib = per_gpu_model_memory_required(model, model_config, tp, pp, HF_TOKEN) + + # Activation (constant per model type, independent of max_model_len) + activation_gib = estimate_vllm_activation_memory(model_config, tp=tp) + + # Non-torch (system overhead) + non_torch_gib = estimate_vllm_non_torch_memory(tp) + + # CUDA graph (included in activation profiling → 0 as separate term) + cuda_graph_gib = estimate_vllm_cuda_graph_memory() + + # Total non-KV per GPU = weight/gpu + activation + non_torch + cuda_graph + total_non_kv_per_gpu = per_gpu_weight_gib + activation_gib + non_torch_gib + cuda_graph_gib + + # Allocatable KV cache: 
total across tp*pp*dp GPUs + alloc_kv_total = allocatable_kv_cache_memory( + model, + model_config, + GPU_MEMORY_GIB, + gpu_util, + tp, + pp, + dp, + max_model_len=max_model_len, + hf_token=HF_TOKEN, + ) + # Per-GPU KV = total / (tp * pp * dp) — matches what vLLM reports per worker + per_gpu_kv_gib = alloc_kv_total / (tp * pp * dp) + + # KV cache detail: per-token bytes and per-request bytes + kv_detail = KVCacheDetail(model, model_config, context_len=max_model_len) + per_token_bytes = kv_detail.per_token_memory_bytes + per_token_bytes_per_gpu = per_token_bytes / (tp * pp) + + # Derive token count and max concurrency from predicted KV memory + alloc_kv_bytes = per_gpu_kv_gib * (1024**3) + # Account for TP sharding: each GPU holds 1/(tp*pp) of each token's KV + kv_tokens = int(alloc_kv_bytes / per_token_bytes_per_gpu) if per_token_bytes_per_gpu > 0 else 0 + per_request_bytes = per_token_bytes_per_gpu * max_model_len + max_concurrency = alloc_kv_bytes / per_request_bytes if per_request_bytes > 0 else 0 + + arch = "" + if hasattr(model_config, "architectures") and model_config.architectures: + arch = model_config.architectures[0] + + return { + # Config (pass-through) + "model": model, + "gpu": row["gpu"], + "tp": tp, + "pp": pp, + "dp": dp, + "max_model_len": max_model_len, + "dtype": row["dtype"], + "quantization": row["quantization"], + "kv_cache_dtype": row["kv_cache_dtype"], + "gpu_memory_utilization": gpu_util, + # Model architecture + "architecture": arch, + "attention_type": kv_detail.attention_type.value, + "num_hidden_layers": kv_detail.num_hidden_layers, + "num_kv_heads": kv_detail.num_key_value_heads, + "head_dimension": kv_detail.head_dimension, + "kv_dtype_bytes": kv_detail.precision_in_bytes, + "per_token_kv_bytes": per_token_bytes, + "per_token_kv_bytes_per_gpu": per_token_bytes_per_gpu, + # Memory predictions (per GPU) + "pred_weight_memory_gib": round(per_gpu_weight_gib, 4), + "pred_activation_memory_gib": round(activation_gib, 4), + 
"pred_non_torch_gib": round(non_torch_gib, 4), + "pred_cuda_graph_gib": round(cuda_graph_gib, 4), + "pred_total_non_kv_cache_gib": round(total_non_kv_per_gpu, 4), + "pred_kv_cache_memory_gib": round(per_gpu_kv_gib, 4), + # Derived capacity predictions + "pred_kv_cache_tokens": kv_tokens, + "pred_max_concurrency": round(max_concurrency, 2), + # Totals used in formula + "pred_total_weight_gib": round(total_weight_gib, 4), + "pred_alloc_kv_total_gib": round(alloc_kv_total, 4), + } + + +COLUMNS = [ + "model", "gpu", "tp", "pp", "dp", "max_model_len", + "dtype", "quantization", "kv_cache_dtype", "gpu_memory_utilization", + "architecture", "attention_type", + "num_hidden_layers", "num_kv_heads", "head_dimension", + "kv_dtype_bytes", "per_token_kv_bytes", "per_token_kv_bytes_per_gpu", + "pred_weight_memory_gib", + "pred_activation_memory_gib", + "pred_non_torch_gib", + "pred_cuda_graph_gib", + "pred_total_non_kv_cache_gib", + "pred_kv_cache_memory_gib", + "pred_kv_cache_tokens", + "pred_max_concurrency", + "pred_total_weight_gib", + "pred_alloc_kv_total_gib", +] + + +def main(): + raw_csv = REPO_ROOT / "accuracy/results/v0.19.0/results_raw.csv" + out_csv = REPO_ROOT / "accuracy/results/v0.19.0/results_predicted.csv" + + rows = [r for r in csv.DictReader(raw_csv.open()) if r["status"] == "ok"] + print(f"Processing {len(rows)} successful rows (H100, gpu_memory={GPU_MEMORY_GIB} GiB)\n") + + results = [] + failed = [] + for row in rows: + model = row["model"] + label = f"{model} tp={row['tp']} pp={row['pp']} len={row['max_model_len']}" + try: + pred = predict(row) + results.append(pred) + print(f" ✓ {label}") + print(f" weight={pred['pred_weight_memory_gib']} GiB " + f"activ={pred['pred_activation_memory_gib']} GiB " + f"kv={pred['pred_kv_cache_memory_gib']} GiB") + except Exception as e: + failed.append((label, str(e))) + print(f" ✗ {label}: {e}") + if "--verbose" in sys.argv: + traceback.print_exc() + + with out_csv.open("w", newline="") as f: + writer = csv.DictWriter(f, 
fieldnames=COLUMNS, extrasaction="ignore") + writer.writeheader() + writer.writerows(results) + + print(f"\nWrote {len(results)} rows → {out_csv}") + if failed: + print(f"\nFailed ({len(failed)}):") + for label, err in failed: + print(f" {label}: {err}") + + +if __name__ == "__main__": + main() diff --git a/accuracy/scripts/sweep.yaml b/accuracy/scripts/sweep.yaml index 1a6b96b7..7316a019 100644 --- a/accuracy/scripts/sweep.yaml +++ b/accuracy/scripts/sweep.yaml @@ -21,103 +21,214 @@ defaults: nvidia.com/gpu.product: NVIDIA-H100-80GB-HBM3 runs: - # ── Kimi models (moonshotai) ────────────────────────────────────────────── - - model: moonshotai/Kimi-VL-A3B-Instruct # 16B total, 3B active MoE, vision-language; num_attention_heads=16 - tp: [1, 2] - trust_remote_code: true + # ── Core model coverage ─────────────────────────────────────────────────── + # One run per model at the minimum feasible TP. TP sensitivity is captured + # separately in the "Argument sensitivity: tensor parallelism" section below. 
- # Kimi-K2-Instruct (1T) and Kimi-K2.6 (1.1T) removed — OOM on 8x H100 80GB (~640 GB < ~1 TB weights in FP8) + # DONE: moonshotai/Kimi-VL-A3B-Instruct tp=1 + # - model: moonshotai/Kimi-VL-A3B-Instruct # 16B total, 3B active MoE, vision-language + # tp: 1 + # trust_remote_code: true - - model: moonshotai/Kimi-Dev-72B # 72B dense, Qwen2 architecture; num_attention_heads=64 - tp: [2, 4] # tp=8 skipped — no 8-GPU nodes available - trust_remote_code: true + # DONE: moonshotai/Kimi-Dev-72B tp=2 + # - model: moonshotai/Kimi-Dev-72B # 72B dense, Qwen2 architecture + # tp: 2 # tp=1 OOM: ~144 GiB weights exceed single H100 80GB + # trust_remote_code: true - # ── Argument sensitivity: data parallelism ──────────────────────────────── - - model: meta-llama/Llama-3.1-8B-Instruct + - model: codellama/CodeLlama-7b-hf # 7B dense; LlamaForCausalLM architecture tp: 1 - dp: [1, 2] - _sweep_dim: dp - # ── Argument sensitivity: --dtype ───────────────────────────────────────── - # Hold kv_cache_dtype fixed to avoid confounding KV cache precision. - # float32 is included to quantify the known planner gap: the planner reads - # the safetensors storage dtype (bf16) and never consults the --dtype flag, - # so fp32 will cause ~2× weight under-prediction and inflated KV estimate. 
- - model: meta-llama/Llama-3.1-8B-Instruct + - model: codellama/CodeLlama-34b-hf # 34B dense; ~65 GiB bf16; tp=1 leaves ~11 GiB KV headroom + tp: 1 # if OOM, retry with tp=2 + + - model: deepseek-ai/DeepSeek-V2-Lite-Chat # 16B total, 2.4B active MoE; DeepSeekV2 arch tp: 1 - dtype: [float16, bfloat16, float32] - kv_cache_dtype: auto - _sweep_dim: dtype - # ── Argument sensitivity: --quantization ────────────────────────────────── - - model: meta-llama/Llama-3.1-8B-Instruct # FP16 baseline (no quantization) + - model: ibm-granite/granite-3.1-2b-instruct tp: 1 - dtype: float16 - kv_cache_dtype: auto - quantization: null - _sweep_dim: quantization + _label: granite-3-1-2b - - model: RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 # FP8 weights (compressed-tensors format, auto-detected) + - model: ibm-granite/granite-3.1-8b-instruct tp: 1 - dtype: float16 - kv_cache_dtype: auto - quantization: null - _label: w8a8-redhatai-llama-3-1-8b - _sweep_dim: quantization + _label: granite-3-1-8b - - model: RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 # INT4 weights (compressed-tensors format, auto-detected) + - model: ibm-granite/granite-3.3-8b-instruct tp: 1 - dtype: float16 - kv_cache_dtype: auto - quantization: null - _label: w4a16-redhatai-llama-3-1-8b - _sweep_dim: quantization - # ── Extended quantization coverage ─────────────────────────────────────── - # Medium model (24B) w8a8: maps out where the activation-constant error becomes - # significant — between 8B (error negligible) and 70B (error dominant at TP=1). - - model: RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 - tp: [1, 2] - _label: w8a8-mistral-small-24b - _sweep_dim: quantization + - model: ibm-granite/granite-vision-3.3-2b # vision-language; GraniteSpeechEncoderModel arch + tp: 1 - # Large model fp8-dynamic: tests a different quantization format from w8a8. - # fp8 weights are 1 byte/param but activations may differ from int8. 
- # TP=1 skipped — ~65 GiB fp8 weights + overhead leaves <5 GiB KV on a single H100. - - model: RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic - tp: [2, 4] - _label: fp8dyn-llama-3-3-70b - _sweep_dim: quantization + - model: microsoft/phi-4 # 14B dense; Phi3 architecture + tp: 1 - # Qwen2.5-7B quantized: compare against existing unquantized Qwen2.5-7B baseline. - # Both formats on the same small model isolate quantization effect from size/arch. - - model: RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 + - model: mistralai/Mistral-Small-3.1-24B-Instruct-2503 # 24B dense; Mistral3 architecture tp: 1 - _label: w8a8-redhatai-qwen2-5-7b - _sweep_dim: quantization - - model: RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic + - model: mistralai/Mixtral-8x7B-Instruct-v0.1 # 56B total, 14B active MoE + tp: 2 # tp=1 OOM: ~87 GiB weights exceed single H100 80GB + + - model: openai/gpt-oss-20b # 20B dense; tp=1 OOM during CUDA graph warmup + tp: 2 + # gpt-oss-120b skipped — OOM or infra unavailability at all tested tp values + + - model: Qwen/Qwen2.5-7B-Instruct # 7B dense; reference model for sensitivity sweeps tp: 1 - _label: fp8dyn-redhatai-qwen2-5-7b - _sweep_dim: quantization - # ── Argument sensitivity: --kv-cache-dtype ──────────────────────────────── - # FP8 KV cache halves per-token bytes → GPU block count should ~2×. 
- # FP8 KV cache requires bfloat16 compute dtype in vLLM v0.19.0 + - model: Qwen/Qwen2.5-72B-Instruct # 72B dense + tp: 2 # tp=1 OOM: ~144 GiB weights exceed single H100 80GB + + - model: Qwen/Qwen3-8B # 8B dense; Qwen3 architecture + tp: 1 + + - model: Qwen/Qwen3-30B-A3B # 30B total, 3B active MoE; Qwen3Moe architecture + tp: 1 + + - model: meta-llama/Llama-4-Scout-17B-16E-Instruct # 109B total, 17B active MoE (16 experts) + tp: 4 # tp=1 OOM (~212 GiB total), tp=2 OOM (~106 GiB total) + + # ── Gemma models ────────────────────────────────────────────────────────── + # BLOCKED: HF token needs access granted at https://huggingface.co/google/gemma-2-2b-it + # (GatedRepoError 403 — all google/gemma-* repos require explicit access approval) + # - model: google/gemma-2-2b-it # 2.6B dense; Gemma2 architecture + # tp: 1 + # - model: google/gemma-2-9b-it # 9.2B dense; Gemma2 architecture + # tp: 1 + # - model: google/gemma-2-27b-it # 27.2B dense; fits H100 80GB at bf16 (~54 GiB weights) + # tp: 1 + # - model: google/gemma-3-4b-it # 4B dense; Gemma3 architecture + # tp: 1 + # - model: google/gemma-3-12b-it # 12B dense; Gemma3 architecture + # tp: 1 + # - model: google/gemma-3-27b-it # 27B dense; Gemma3 architecture + # tp: 1 + + # ── Kimi Dev 72B TP sensitivity (retry tp=4; tp=2 succeeded) ───────────── + - model: moonshotai/Kimi-Dev-72B + tp: 4 + trust_remote_code: true + _sweep_dim: tp + + # ── Argument sensitivity: tensor parallelism ───────────────────────────── + # Llama-3.1-8B has 32 attention heads; tp=3 is invalid (32 % 3 ≠ 0). 
+ - model: meta-llama/Llama-3.1-8B-Instruct + tp: [1, 2, 4] + _sweep_dim: tp + + - model: Qwen/Qwen2.5-7B-Instruct # 28 attention heads; valid tp: 1, 2, 4, 7, 14, 28 + tp: [1, 2, 4] + _sweep_dim: tp + + # ── Argument sensitivity: data parallelism ──────────────────────────────── + - model: meta-llama/Llama-3.1-8B-Instruct + tp: 1 + dp: [1, 2] + _sweep_dim: dp + + # ── Argument sensitivity: pipeline parallelism ─────────────────────────── + - model: meta-llama/Llama-3.1-8B-Instruct + tp: 1 + pp: [2, 4] + _sweep_dim: pp + + # ── Argument sensitivity: --max-model-len ──────────────────────────────── - model: meta-llama/Llama-3.1-8B-Instruct tp: 1 - dtype: bfloat16 - kv_cache_dtype: [auto, fp8] - _sweep_dim: kv_cache_dtype + max_model_len: [2048, 4096, 8192, 16384, 32768] + _sweep_dim: max_model_len - model: Qwen/Qwen2.5-7B-Instruct tp: 1 - dtype: bfloat16 - kv_cache_dtype: [auto, fp8] - _sweep_dim: kv_cache_dtype + max_model_len: [2048, 4096, 8192, 16384, 32768] + _sweep_dim: max_model_len + + # ── Argument sensitivity: --dtype ───────────────────────────────────────── + # DONE: all three dtype values completed + # - model: meta-llama/Llama-3.1-8B-Instruct + # tp: 1 + # dtype: [float16, bfloat16, float32] + # kv_cache_dtype: auto + # _sweep_dim: dtype + + # ── Argument sensitivity: --kv-cache-dtype ──────────────────────────────── + # DONE: both auto and fp8 completed + # - model: meta-llama/Llama-3.1-8B-Instruct + # tp: 1 + # dtype: bfloat16 + # kv_cache_dtype: [auto, fp8] + # _sweep_dim: kv_cache_dtype + + # DONE: both auto and fp8 completed + # - model: Qwen/Qwen2.5-7B-Instruct + # tp: 1 + # dtype: bfloat16 + # kv_cache_dtype: [auto, fp8] + # _sweep_dim: kv_cache_dtype # ── Argument sensitivity: non-power-of-2 tp ────────────────────────────── - # Qwen3-14B: num_attention_heads=40, so tp=5 is valid (40 % 5 == 0). - - model: Qwen/Qwen3-14B - tp: 5 - _sweep_dim: tp_odd + # INVALID: Qwen3-14B vocab_size=151936 is not divisible by 5 (vLLM enforces + # vocab sharding). 
num_attention_heads=40 supports tp=5, but vocab does not. + # Valid tp values: divisors of gcd(40, 151936)=8 → {1,2,4,8}. + # - model: Qwen/Qwen3-14B + # tp: 5 + # _sweep_dim: tp_odd + + # ── Argument sensitivity: --quantization ────────────────────────────────── + # DONE: FP16 baseline (same run_id as dtype=float16 above) + # - model: meta-llama/Llama-3.1-8B-Instruct # FP16 baseline + # tp: 1 + # dtype: float16 + # kv_cache_dtype: auto + # quantization: null + # _sweep_dim: quantization + + # runtime fp8 quantization — not yet done + - model: meta-llama/Llama-3.1-8B-Instruct + tp: 1 + quantization: fp8 + _sweep_dim: quantization + + # DONE: w8a8 + # - model: RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 + # tp: 1 + # dtype: float16 + # kv_cache_dtype: auto + # quantization: null + # _label: w8a8-redhatai-llama-3-1-8b + # _sweep_dim: quantization + + # DONE: w4a16 + # - model: RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 + # tp: 1 + # dtype: float16 + # kv_cache_dtype: auto + # quantization: null + # _label: w4a16-redhatai-llama-3-1-8b + # _sweep_dim: quantization + + # DONE: w8a8 Mistral (tp=1 and tp=2) + # - model: RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 + # tp: 1 + # _label: w8a8-mistral-small-24b + # _sweep_dim: quantization + + # DONE: fp8-dynamic (tp=2 and tp=4) + # - model: RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic + # tp: 2 # tp=1 OOM: ~65 GiB fp8 weights leave <5 GiB KV on single H100 + # _label: fp8dyn-llama-3-3-70b + # _sweep_dim: quantization + + - model: redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 + tp: 2 # tp=1 OOM for same reason + _sweep_dim: quantization + + # DONE: w8a8 Qwen2.5 7B + # - model: RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 + # tp: 1 + # _label: w8a8-redhatai-qwen2-5-7b + # _sweep_dim: quantization + + # DONE: fp8-dynamic Qwen2.5 7B + # - model: RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic + # tp: 1 + # _label: fp8dyn-redhatai-qwen2-5-7b + # _sweep_dim: quantization diff --git 
a/accuracy/scripts/sweep_runner.py b/accuracy/scripts/sweep_runner.py index b8ca49b4..9edab422 100644 --- a/accuracy/scripts/sweep_runner.py +++ b/accuracy/scripts/sweep_runner.py @@ -126,6 +126,7 @@ def _build_job_manifest(run_id: str, run: dict[str, Any]) -> dict[str, Any]: {"name": "XDG_CACHE_HOME", "value": "/data/.cache"}, {"name": "FLASHINFER_WORKSPACE_DIR", "value": "/data/.cache/flashinfer"}, {"name": "VLLM_ATTENTION_BACKEND", "value": "FLASH_ATTN"}, + {"name": "VLLM_LOGGING_LEVEL", "value": "DEBUG"}, ], "resources": { "limits": {"nvidia.com/gpu": num_gpus}, @@ -156,6 +157,13 @@ def _submit_sub_job(batch_api: Any, run_id: str, run: dict[str, Any]) -> None: print(f" Submitted: vllm-mem-{run_id}", flush=True) +class PodFailedError(RuntimeError): + """Raised when a vLLM pod terminates with a non-zero exit or phase=Failed.""" + def __init__(self, pod_name: str, message: str) -> None: + super().__init__(message) + self.pod_name = pod_name + + def _wait_for_pod_ready(core_api: Any, run_id: str, namespace: str, timeout: int = 2400) -> str: """Block until the pod's startupProbe passes. 
Returns the pod name.""" @@ -168,29 +176,47 @@ def _wait_for_pod_ready(core_api: Any, run_id: str, namespace: str, label_selector=label_sel, timeout_seconds=timeout): pod = event["object"] + pod_name = pod.metadata.name # Detect terminal failure immediately rather than waiting for timeout if pod.status.phase == "Failed": w.stop() - raise RuntimeError(f"Pod {pod.metadata.name} failed (phase=Failed)") + raise PodFailedError(pod_name, f"Pod {pod_name} failed (phase=Failed)") if pod.status.container_statuses: for cs in pod.status.container_statuses: if cs.ready: w.stop() - print(f" Pod ready: {pod.metadata.name}", flush=True) - return pod.metadata.name + print(f" Pod ready: {pod_name}", flush=True) + return pod_name # Terminated with non-zero exit = OOM or crash if cs.state and cs.state.terminated and cs.state.terminated.exit_code != 0: w.stop() reason = cs.state.terminated.reason or "unknown" - raise RuntimeError( - f"Pod {pod.metadata.name} terminated: {reason} " + raise PodFailedError( + pod_name, + f"Pod {pod_name} terminated: {reason} " f"(exit {cs.state.terminated.exit_code})" ) raise TimeoutError(f"Pod for run-id={run_id} did not become ready within {timeout}s") -def _fetch_pod_log(core_api: Any, pod_name: str, namespace: str) -> str: - return core_api.read_namespaced_pod_log(name=pod_name, namespace=namespace) +def _fetch_pod_log(core_api: Any, pod_name: str, namespace: str, + previous: bool = False) -> str: + return core_api.read_namespaced_pod_log( + name=pod_name, namespace=namespace, previous=previous + ) + + +def _cleanup_stale_job(batch_api: Any, run_id: str, namespace: str) -> None: + """Delete a leftover job from a previous crashed sweep before re-submitting.""" + from kubernetes import client + try: + batch_api.delete_namespaced_job( + name=f"vllm-mem-{run_id}", namespace=namespace, + body=client.V1DeleteOptions(propagation_policy="Background"), + ) + print(" Cleaned up stale job before submit.", flush=True) + except Exception: + pass # Job doesn't 
exist — expected on first run def _delete_job(batch_api: Any, run_id: str, namespace: str) -> None: @@ -220,6 +246,9 @@ def run_sweep(runs: list[dict[str, Any]], results_dir: Path) -> None: logs_dir.mkdir(parents=True, exist_ok=True) runs_dir.mkdir(parents=True, exist_ok=True) + successes = 0 + failures = 0 + for i, run in enumerate(runs, 1): run_id = make_run_id(run) json_path = runs_dir / f"{run_id}.json" @@ -240,8 +269,8 @@ def run_sweep(runs: list[dict[str, Any]], results_dir: Path) -> None: print(" Skipping — result already exists.", flush=True) continue - pod_name_on_fail: str | None = None try: + _cleanup_stale_job(batch_api, run_id, namespace) _submit_sub_job(batch_api, run_id, run) pod_name = _wait_for_pod_ready(core_api, run_id, namespace) log_text = _fetch_pod_log(core_api, pod_name, namespace) @@ -249,19 +278,34 @@ def run_sweep(runs: list[dict[str, Any]], results_dir: Path) -> None: print(f" Log saved: {log_path}", flush=True) except Exception as e: print(f" Run failed: {e}", flush=True) - # Try to save failure log so the error is visible after pod deletion + # Save failure log before the job is deleted in the finally block. + # PodFailedError carries the pod name directly; other failures + # (e.g. TimeoutError) require a pod list to find it. try: - pods = core_api.list_namespaced_pod( - namespace=namespace, label_selector=f"run-id={run_id}" - ) - if pods.items: - fail_pod = pods.items[0].metadata.name - fail_log = _fetch_pod_log(core_api, fail_pod, namespace) + if isinstance(e, PodFailedError): + fail_pod = e.pod_name + else: + pods = core_api.list_namespaced_pod( + namespace=namespace, label_selector=f"run-id={run_id}" + ) + fail_pod = pods.items[0].metadata.name if pods.items else None + if fail_pod: + # For terminated containers use previous=True to get the + # crashed container's output rather than an empty current log. 
+ terminated = isinstance(e, PodFailedError) + try: + fail_log = _fetch_pod_log(core_api, fail_pod, namespace, + previous=terminated) + except Exception: + fail_log = _fetch_pod_log(core_api, fail_pod, namespace) fail_path = logs_dir / f"{run_id}.FAILED.log" fail_path.write_text(fail_log) print(f" Failure log saved: {fail_path}", flush=True) + else: + print(" No pod found — failure log unavailable.", flush=True) except Exception as log_err: print(f" Could not save failure log: {log_err}", flush=True) + failures += 1 continue finally: try: @@ -271,35 +315,41 @@ def run_sweep(runs: list[dict[str, Any]], results_dir: Path) -> None: try: parsed = pl.parse(log_path) - except ValueError as e: + except Exception as e: print(f" Parse failed: {e}", flush=True) + failures += 1 continue - record = { - "model": run["model"], - "gpu": run["gpu"], - "vllm_args": { - "tensor_parallel_size": run["tp"], - "pipeline_parallel_size": run["pp"], - "data_parallel_size": run["dp"], - "max_model_len": run["max_model_len"], - "gpu_memory_utilization": float(run["gpu_memory_utilization"]), - "dtype": run.get("dtype", "auto"), - "quantization": run.get("quantization"), - "kv_cache_dtype": run.get("kv_cache_dtype", "auto"), - }, - "timestamp": datetime.now(timezone.utc).isoformat(), - "log_path": str(log_path), - **parsed, - # planner_predicted and error_pct are added in a separate calibration step - } - if "_sweep_dim" in run: - record["_sweep_dim"] = run["_sweep_dim"] - - json_path.write_text(json.dumps(record, indent=2)) - print(f" JSON saved: {json_path}", flush=True) - - print(f"\nSweep complete. 
Results in {results_dir}", flush=True) + try: + record = { + "model": run["model"], + "gpu": run["gpu"], + "vllm_args": { + "tensor_parallel_size": run["tp"], + "pipeline_parallel_size": run["pp"], + "data_parallel_size": run["dp"], + "max_model_len": run["max_model_len"], + "gpu_memory_utilization": float(run["gpu_memory_utilization"]), + "dtype": run.get("dtype", "auto"), + "quantization": run.get("quantization"), + "kv_cache_dtype": run.get("kv_cache_dtype", "auto"), + }, + "timestamp": datetime.now(timezone.utc).isoformat(), + "log_path": str(log_path), + **parsed, + # planner_predicted and error_pct are added in a separate calibration step + } + if "_sweep_dim" in run: + record["_sweep_dim"] = run["_sweep_dim"] + + json_path.write_text(json.dumps(record, indent=2)) + print(f" JSON saved: {json_path}", flush=True) + successes += 1 + except Exception as e: + print(f" Failed to save result: {e}", flush=True) + failures += 1 + + print(f"\nSweep complete. {successes} succeeded, {failures} failed. 
Results in {results_dir}", flush=True) # ── Version extraction ──────────────────────────────────────────────────────── From 2c89f478723aead65c9a3521746dce98b5b76cf7 Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Wed, 22 Apr 2026 11:23:45 -0400 Subject: [PATCH 03/24] Update models list Signed-off-by: Jing Chen --- accuracy/k8s/orchestrator-job-codellama.yaml | 46 +++++ accuracy/results/v0.19.0/report.md | 180 ------------------- accuracy/results/v0.19.0/run_matrix.md | 76 ++++++++ accuracy/scripts/sweep-codellama.yaml | 21 +++ accuracy/scripts/sweep.yaml | 4 +- 5 files changed, 145 insertions(+), 182 deletions(-) create mode 100644 accuracy/k8s/orchestrator-job-codellama.yaml delete mode 100644 accuracy/results/v0.19.0/report.md create mode 100644 accuracy/results/v0.19.0/run_matrix.md create mode 100644 accuracy/scripts/sweep-codellama.yaml diff --git a/accuracy/k8s/orchestrator-job-codellama.yaml b/accuracy/k8s/orchestrator-job-codellama.yaml new file mode 100644 index 00000000..c6b8917b --- /dev/null +++ b/accuracy/k8s/orchestrator-job-codellama.yaml @@ -0,0 +1,46 @@ +# Submit with: kubectl apply -f accuracy/k8s/orchestrator-job-codellama.yaml +# Monitor with: kubectl logs -f job/vllm-mem-orchestrator-codellama -n llmdplanner +apiVersion: batch/v1 +kind: Job +metadata: + name: vllm-mem-orchestrator-codellama + namespace: llmdplanner +spec: + backoffLimit: 0 + activeDeadlineSeconds: 7200 # 2-hour cap for 2-model run + template: + spec: + serviceAccountName: vllm-mem-orchestrator + restartPolicy: Never + volumes: + - name: data + persistentVolumeClaim: + claimName: vllm-mem-data + - name: scripts + configMap: + name: vllm-mem-scripts + defaultMode: 0755 + - name: sweep + configMap: + name: vllm-mem-sweep-codellama + containers: + - name: orchestrator + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + pip install pyyaml kubernetes --quiet --no-cache-dir && + python /scripts/sweep_runner.py \ + --config /sweep/sweep-codellama.yaml \ + --results 
/data/results/ + volumeMounts: + - name: data + mountPath: /data + - name: scripts + mountPath: /scripts + - name: sweep + mountPath: /sweep + resources: + requests: + cpu: "500m" + memory: "512Mi" diff --git a/accuracy/results/v0.19.0/report.md b/accuracy/results/v0.19.0/report.md deleted file mode 100644 index b96636c5..00000000 --- a/accuracy/results/v0.19.0/report.md +++ /dev/null @@ -1,180 +0,0 @@ -# Memory Validation Report - -## Per-component error - -| Model | TP | PP | DP | max_len | Weight | Activation | Non-torch | KV cache | -|---|---|---|---|---|---|---|---|---| -| deepseek-ai/DeepSeek-V2-Lite-Chat | 1 | 1 | 1 | 8192 | -0.6% | +314.5% | -42.3% | -11.5% | -| RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic | 2 | 1 | 1 | 8192 | -0.1% | +144.9% | -71.4% | +5.0% | -| RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic | 4 | 1 | 1 | 8192 | -0.2% | +143.7% | -72.9% | +5.9% | -| RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic | 1 | 1 | 1 | 8192 | -0.4% | +153.4% | -37.5% | -3.9% | -| ibm-granite/granite-3.1-2b-instruct | 1 | 1 | 1 | 8192 | -0.4% | +633.3% | -67.4% | -5.3% | -| ibm-granite/granite-3.1-8b-instruct | 1 | 1 | 1 | 8192 | -0.2% | +547.1% | -67.4% | -6.0% | -| ibm-granite/granite-3.3-8b-instruct | 1 | 1 | 1 | 8192 | -0.2% | +547.1% | -67.4% | -6.0% | -| ibm-granite/granite-vision-3.3-2b | 1 | 1 | 1 | 8192 | 0.0% | +216.5% | -40.0% | -1.2% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | -0.2% | +154.0% | -40.0% | -3.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | -0.2% | +154.0% | -40.0% | -3.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | +76.2% | +154.0% | -40.0% | -13.2% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | -0.2% | +154.0% | -40.0% | -3.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | -50.1% | +117.2% | -40.0% | +31.1% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 2048 | -0.2% | +154.0% | -40.0% | -3.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 4096 | -0.2% | +154.0% | -40.0% | 
-3.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | -0.2% | +154.0% | -40.0% | -3.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 2 | 1 | 8192 | -0.4% | +336.4% | +114.3% | -0.9% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 4 | 1 | 8192 | -12.2% | +357.1% | +114.3% | +1.6% | -| meta-llama/Llama-3.1-8B-Instruct | 2 | 1 | 1 | 8192 | -0.4% | +154.0% | -71.0% | +2.8% | -| meta-llama/Llama-3.1-8B-Instruct | 4 | 1 | 1 | 8192 | -0.8% | +154.0% | -71.8% | +4.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 16384 | -0.2% | +154.0% | -40.0% | -3.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 32768 | -0.2% | +154.0% | -40.0% | -3.5% | -| microsoft/phi-4 | 1 | 1 | 1 | 8192 | -0.3% | +261.8% | -40.0% | -6.6% | -| mistralai/Mistral-Small-3.1-24B-Instruct-2503 | 1 | 1 | 1 | 8192 | -0.1% | +23.2% | -40.0% | +1.6% | -| mistralai/Mixtral-8x7B-Instruct-v0.1 | 2 | 1 | 1 | 8192 | -0.0% | +561.2% | -71.0% | -1.9% | -| moonshotai/Kimi-Dev-72B | 2 | 1 | 1 | 8192 | -0.2% | +144.5% | -71.3% | +61.9% | -| moonshotai/Kimi-Dev-72B | 4 | 1 | 1 | 8192 | -0.4% | +144.5% | -72.9% | +9.3% | -| moonshotai/Kimi-VL-A3B-Instruct | 1 | 1 | 1 | 8192 | -0.6% | +174.0% | -40.0% | -9.8% | -| moonshotai/Kimi-VL-A3B-Instruct | 2 | 1 | 1 | 8192 | -1.6% | +180.7% | -71.0% | +2.4% | -| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | -0.4% | +153.4% | -37.5% | -4.2% | -| Qwen/Qwen2.5-72B-Instruct | 2 | 1 | 1 | 8192 | -0.1% | +144.5% | -71.3% | +60.9% | -| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | -0.4% | +153.4% | -37.5% | -4.2% | -| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 16384 | -0.4% | +153.4% | -37.5% | -4.2% | -| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 32768 | -0.4% | +153.4% | -37.5% | -4.2% | -| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 2048 | -0.4% | +153.4% | -37.5% | -4.2% | -| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 4096 | -0.4% | +153.4% | -37.5% | -4.2% | -| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | -0.4% | +153.4% | -37.5% | -4.2% | -| Qwen/Qwen2.5-7B-Instruct | 2 | 1 | 1 | 
8192 | -0.4% | +153.4% | -70.9% | +2.6% | -| Qwen/Qwen2.5-7B-Instruct | 4 | 1 | 1 | 8192 | 0.0% | +153.4% | -71.8% | +4.6% | -| Qwen/Qwen3-30B-A3B | 1 | 1 | 1 | 8192 | -0.0% | +198.5% | -44.4% | -28.7% | -| Qwen/Qwen3-8B | 1 | 1 | 1 | 8192 | -0.1% | +153.4% | -40.0% | -4.4% | -| redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 | 2 | 1 | 1 | 8192 | -0.1% | +144.9% | -71.6% | +5.0% | -| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 | 1 | 1 | 1 | 8192 | -0.7% | +154.0% | -40.0% | -3.0% | -| RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 | 1 | 1 | 1 | 8192 | -0.2% | +14.7% | -42.3% | +1.2% | -| RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 | 2 | 1 | 1 | 8192 | -0.8% | +23.2% | -71.0% | +5.3% | -| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | 1 | 1 | 1 | 8192 | -0.4% | +154.0% | -40.0% | -3.1% | -| RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 | 1 | 1 | 1 | 8192 | -0.4% | +153.4% | -40.0% | -3.9% | - -## Per-architecture error - -_Group by architecture class. 
Mean and max absolute error per component._ - -## Argument sensitivity - -### max_model_len sweep - -| Value | Weight | Activation | Non-torch | KV cache | -|---|---|---|---|---| -| 2048 | -0.2% | +154.0% | -40.0% | -3.5% | -| 4096 | -0.2% | +154.0% | -40.0% | -3.5% | -| 16384 | -0.2% | +154.0% | -40.0% | -3.5% | -| 32768 | -0.2% | +154.0% | -40.0% | -3.5% | -| 16384 | -0.4% | +153.4% | -37.5% | -4.2% | -| 32768 | -0.4% | +153.4% | -37.5% | -4.2% | -| 2048 | -0.4% | +153.4% | -37.5% | -4.2% | -| 4096 | -0.4% | +153.4% | -37.5% | -4.2% | - -### tp sweep - -| Value | Weight | Activation | Non-torch | KV cache | -|---|---|---|---|---| -| 2 | -0.4% | +154.0% | -71.0% | +2.8% | -| 4 | -0.8% | +154.0% | -71.8% | +4.5% | -| 4 | -0.4% | +144.5% | -72.9% | +9.3% | -| 2 | -0.4% | +153.4% | -70.9% | +2.6% | -| 4 | 0.0% | +153.4% | -71.8% | +4.6% | - -### pp sweep - -| Value | Weight | Activation | Non-torch | KV cache | -|---|---|---|---|---| -| 2 | -0.4% | +336.4% | +114.3% | -0.9% | -| 4 | -12.2% | +357.1% | +114.3% | +1.6% | - -### dp sweep - -| Value | Weight | Activation | Non-torch | KV cache | -|---|---|---|---|---| -| 1 | -0.2% | +154.0% | -40.0% | -3.5% | - -### dtype sweep - -| Value | Weight | Activation | Non-torch | KV cache | -|---|---|---|---|---| -| bfloat16 | -0.2% | +154.0% | -40.0% | -3.5% | -| float16 | -0.2% | +154.0% | -40.0% | -3.5% | -| float32 | -50.1% | +117.2% | -40.0% | +31.1% | - -### quantization sweep - -| Value | Weight | Activation | Non-torch | KV cache | -|---|---|---|---|---| -| None | -0.1% | +144.9% | -71.4% | +5.0% | -| None | -0.2% | +143.7% | -72.9% | +5.9% | -| None | -0.4% | +153.4% | -37.5% | -3.9% | -| fp8 | +76.2% | +154.0% | -40.0% | -13.2% | -| None | -0.1% | +144.9% | -71.6% | +5.0% | -| None | -0.7% | +154.0% | -40.0% | -3.0% | -| None | -0.2% | +14.7% | -42.3% | +1.2% | -| None | -0.8% | +23.2% | -71.0% | +5.3% | -| None | -0.4% | +154.0% | -40.0% | -3.1% | -| None | -0.4% | +153.4% | -40.0% | -3.9% | - -### kv_cache_dtype 
sweep - -| Value | Weight | Activation | Non-torch | KV cache | -|---|---|---|---|---| -| fp8 | -0.2% | +154.0% | -40.0% | -3.5% | -| fp8 | -0.4% | +153.4% | -37.5% | -4.2% | -| auto | -0.4% | +153.4% | -37.5% | -4.2% | - -## Outliers - -- **deepseek-ai/DeepSeek-V2-Lite-Chat** (TP=1): {'activation_memory': 314.5077720207254, 'non_torch_memory': -42.307692307692314, 'kv_cache': -11.511121302453557} — root cause required -- **RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic** (TP=2): {'activation_memory': 144.89795918367346, 'non_torch_memory': -71.42857142857143} — root cause required -- **RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic** (TP=4): {'activation_memory': 143.6548223350254, 'non_torch_memory': -72.85067873303167} — root cause required -- **RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic** (TP=1): {'activation_memory': 153.39366515837102, 'non_torch_memory': -37.5} — root cause required -- **ibm-granite/granite-3.1-2b-instruct** (TP=1): {'activation_memory': 633.3333333333333, 'non_torch_memory': -67.3913043478261} — root cause required -- **ibm-granite/granite-3.1-8b-instruct** (TP=1): {'activation_memory': 547.0588235294118, 'non_torch_memory': -67.3913043478261} — root cause required -- **ibm-granite/granite-3.3-8b-instruct** (TP=1): {'activation_memory': 547.0588235294118, 'non_torch_memory': -67.3913043478261} — root cause required -- **ibm-granite/granite-vision-3.3-2b** (TP=1): {'activation_memory': 216.45569620253164, 'non_torch_memory': -40.0} — root cause required -- **meta-llama/Llama-3.1-8B-Instruct** (TP=1): {'activation_memory': 153.96825396825398, 'non_torch_memory': -40.0} — root cause required -- **meta-llama/Llama-3.1-8B-Instruct** (TP=1): {'activation_memory': 153.96825396825398, 'non_torch_memory': -40.0} — root cause required -- **meta-llama/Llama-3.1-8B-Instruct** (TP=1): {'weight_memory': 76.2073027090695, 'activation_memory': 153.96825396825398, 'non_torch_memory': -40.0, 'kv_cache': -13.18681318681318} — root cause required -- 
**meta-llama/Llama-3.1-8B-Instruct** (TP=1): {'activation_memory': 153.96825396825398, 'non_torch_memory': -40.0} — root cause required -- **meta-llama/Llama-3.1-8B-Instruct** (TP=1): {'weight_memory': -50.100066711140755, 'activation_memory': 117.19457013574662, 'non_torch_memory': -40.0, 'kv_cache': 31.051401869158894} — root cause required -- **meta-llama/Llama-3.1-8B-Instruct** (TP=1): {'activation_memory': 153.96825396825398, 'non_torch_memory': -40.0} — root cause required -- **meta-llama/Llama-3.1-8B-Instruct** (TP=1): {'activation_memory': 153.96825396825398, 'non_torch_memory': -40.0} — root cause required -- **meta-llama/Llama-3.1-8B-Instruct** (TP=1): {'activation_memory': 153.96825396825398, 'non_torch_memory': -40.0} — root cause required -- **meta-llama/Llama-3.1-8B-Instruct** (TP=1): {'activation_memory': 336.3636363636363, 'non_torch_memory': 114.28571428571426} — root cause required -- **meta-llama/Llama-3.1-8B-Instruct** (TP=1): {'weight_memory': -12.206572769953041, 'activation_memory': 357.1428571428571, 'non_torch_memory': 114.28571428571426} — root cause required -- **meta-llama/Llama-3.1-8B-Instruct** (TP=2): {'activation_memory': 153.96825396825398, 'non_torch_memory': -71.01449275362319} — root cause required -- **meta-llama/Llama-3.1-8B-Instruct** (TP=4): {'activation_memory': 153.96825396825398, 'non_torch_memory': -71.83098591549295} — root cause required -- **meta-llama/Llama-3.1-8B-Instruct** (TP=1): {'activation_memory': 153.96825396825398, 'non_torch_memory': -40.0} — root cause required -- **meta-llama/Llama-3.1-8B-Instruct** (TP=1): {'activation_memory': 153.96825396825398, 'non_torch_memory': -40.0} — root cause required -- **microsoft/phi-4** (TP=1): {'activation_memory': 261.84210526315786, 'non_torch_memory': -40.0} — root cause required -- **mistralai/Mistral-Small-3.1-24B-Instruct-2503** (TP=1): {'activation_memory': 23.15270935960592, 'non_torch_memory': -40.0} — root cause required -- 
**mistralai/Mixtral-8x7B-Instruct-v0.1** (TP=2): {'activation_memory': 561.1570247933885, 'non_torch_memory': -71.01449275362319} — root cause required -- **moonshotai/Kimi-Dev-72B** (TP=2): {'activation_memory': 144.54148471615719, 'non_torch_memory': -71.29186602870813, 'kv_cache': 61.920529801324484} — root cause required -- **moonshotai/Kimi-Dev-72B** (TP=4): {'activation_memory': 144.54148471615719, 'non_torch_memory': -72.85067873303167} — root cause required -- **moonshotai/Kimi-VL-A3B-Instruct** (TP=1): {'activation_memory': 173.97260273972603, 'non_torch_memory': -40.0} — root cause required -- **moonshotai/Kimi-VL-A3B-Instruct** (TP=2): {'activation_memory': 180.70175438596493, 'non_torch_memory': -71.01449275362319} — root cause required -- **Qwen/Qwen2.5-7B-Instruct** (TP=1): {'activation_memory': 153.39366515837102, 'non_torch_memory': -37.5} — root cause required -- **Qwen/Qwen2.5-72B-Instruct** (TP=2): {'activation_memory': 144.54148471615719, 'non_torch_memory': -71.29186602870813, 'kv_cache': 60.855263157894726} — root cause required -- **Qwen/Qwen2.5-7B-Instruct** (TP=1): {'activation_memory': 153.39366515837102, 'non_torch_memory': -37.5} — root cause required -- **Qwen/Qwen2.5-7B-Instruct** (TP=1): {'activation_memory': 153.39366515837102, 'non_torch_memory': -37.5} — root cause required -- **Qwen/Qwen2.5-7B-Instruct** (TP=1): {'activation_memory': 153.39366515837102, 'non_torch_memory': -37.5} — root cause required -- **Qwen/Qwen2.5-7B-Instruct** (TP=1): {'activation_memory': 153.39366515837102, 'non_torch_memory': -37.5} — root cause required -- **Qwen/Qwen2.5-7B-Instruct** (TP=1): {'activation_memory': 153.39366515837102, 'non_torch_memory': -37.5} — root cause required -- **Qwen/Qwen2.5-7B-Instruct** (TP=1): {'activation_memory': 153.39366515837102, 'non_torch_memory': -37.5} — root cause required -- **Qwen/Qwen2.5-7B-Instruct** (TP=2): {'activation_memory': 153.39366515837102, 'non_torch_memory': -70.87378640776699} — root cause required -- 
**Qwen/Qwen2.5-7B-Instruct** (TP=4): {'activation_memory': 153.39366515837102, 'non_torch_memory': -71.83098591549295} — root cause required -- **Qwen/Qwen3-30B-A3B** (TP=1): {'activation_memory': 198.50746268656715, 'non_torch_memory': -44.44444444444445, 'kv_cache': -28.747566515249833} — root cause required -- **Qwen/Qwen3-8B** (TP=1): {'activation_memory': 153.39366515837102, 'non_torch_memory': -40.0} — root cause required -- **redhatai/Llama-3.3-70B-Instruct-quantized.w8a8** (TP=2): {'activation_memory': 144.89795918367346, 'non_torch_memory': -71.56398104265402} — root cause required -- **RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16** (TP=1): {'activation_memory': 153.96825396825398, 'non_torch_memory': -40.0} — root cause required -- **RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8** (TP=1): {'activation_memory': 14.6788990825688, 'non_torch_memory': -42.307692307692314} — root cause required -- **RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8** (TP=2): {'activation_memory': 23.15270935960592, 'non_torch_memory': -71.01449275362319} — root cause required -- **RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8** (TP=1): {'activation_memory': 153.96825396825398, 'non_torch_memory': -40.0} — root cause required -- **RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8** (TP=1): {'activation_memory': 153.39366515837102, 'non_torch_memory': -40.0} — root cause required - -## Calibration decisions - -_Document constant changes here: old value → new value, evidence._ diff --git a/accuracy/results/v0.19.0/run_matrix.md b/accuracy/results/v0.19.0/run_matrix.md new file mode 100644 index 00000000..46bc15c1 --- /dev/null +++ b/accuracy/results/v0.19.0/run_matrix.md @@ -0,0 +1,76 @@ +# Run Matrix — vLLM v0.19.0 / H100-80GB + +**47 successful runs, 11 failed runs.** + +Quantization abbreviations: `ct` = compressed-tensors, `gptq` = gptq_marlin, `fp8` = fp8 inline, `—` = none. 
+ +## Successful Runs + +| Model | TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight | Activation | Non-torch | KV cache | +|---|---|---|---|---|---|---|---|---|---|---|---| +| deepseek-ai/DeepSeek-V2-Lite-Chat | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.6% | +314.5% | -42.3% | -11.5% | +| RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic | 2 | 1 | 1 | 8192 | bf16 | ct | auto | -0.1% | +144.9% | -71.4% | +5.0% | +| RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic | 4 | 1 | 1 | 8192 | bf16 | ct | auto | -0.2% | +143.7% | -72.9% | +5.9% | +| RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic | 1 | 1 | 1 | 8192 | bf16 | ct | auto | -0.4% | +153.4% | -37.5% | -3.9% | +| ibm-granite/granite-3.1-2b-instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.4% | +633.3% | -67.4% | -5.3% | +| ibm-granite/granite-3.1-8b-instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +547.1% | -67.4% | -6.0% | +| ibm-granite/granite-3.3-8b-instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +547.1% | -67.4% | -6.0% | +| ibm-granite/granite-vision-3.3-2b | 1 | 1 | 1 | 8192 | bf16 | — | auto | +0.0% | +216.5% | -40.0% | -1.2% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | fp8 | -0.2% | +154.0% | -40.0% | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | bf16 | fp8 | auto | +76.2% | +154.0% | -40.0% | -13.2% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | f16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | f32 | — | auto | -50.1% | +117.2% | -40.0% | +31.1% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 2048 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 4096 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | +| 
meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 16384 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 32768 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 2 | 1 | 8192 | bf16 | — | auto | -0.4% | +336.4% | +114.3% | -0.9% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 4 | 1 | 8192 | bf16 | — | auto | -12.2% | +357.1% | +114.3% | +1.6% | +| meta-llama/Llama-3.1-8B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.4% | +154.0% | -71.0% | +2.8% | +| meta-llama/Llama-3.1-8B-Instruct | 4 | 1 | 1 | 8192 | bf16 | — | auto | -0.8% | +154.0% | -71.8% | +4.5% | +| microsoft/phi-4 | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.3% | +261.8% | -40.0% | -6.6% | +| mistralai/Mistral-Small-3.1-24B-Instruct-2503 | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +23.2% | -40.0% | +1.5% | +| mistralai/Mixtral-8x7B-Instruct-v0.1 | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +561.2% | -71.0% | -1.9% | +| moonshotai/Kimi-Dev-72B | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +144.5% | -71.3% | +61.8% | +| moonshotai/Kimi-Dev-72B | 4 | 1 | 1 | 8192 | bf16 | — | auto | -0.5% | +144.5% | -72.9% | +9.3% | +| moonshotai/Kimi-VL-A3B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.6% | +174.0% | -40.0% | -9.8% | +| moonshotai/Kimi-VL-A3B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -1.6% | +180.7% | -71.0% | +2.4% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | fp8 | -0.5% | +153.4% | -37.5% | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 2048 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 4096 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 16384 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 32768 | bf16 | — | auto | -0.5% | 
+153.4% | -37.5% | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.4% | +153.4% | -70.9% | +2.6% | +| Qwen/Qwen2.5-7B-Instruct | 4 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +153.4% | -71.8% | +4.6% | +| Qwen/Qwen2.5-72B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +144.5% | -71.3% | +60.8% | +| Qwen/Qwen3-8B | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +153.4% | -40.0% | -4.4% | +| Qwen/Qwen3-30B-A3B | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +198.5% | -44.4% | -28.8% | +| redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 | 2 | 1 | 1 | 8192 | bf16 | ct | auto | -0.1% | +144.9% | -71.6% | +5.0% | +| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 | 1 | 1 | 1 | 8192 | f16 | gptq | auto | -0.7% | +154.0% | -40.0% | -3.0% | +| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | 1 | 1 | 1 | 8192 | f16 | ct | auto | -0.4% | +154.0% | -40.0% | -3.1% | +| RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 | 1 | 1 | 1 | 8192 | bf16 | ct | auto | -0.2% | +14.7% | -42.3% | +1.2% | +| RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 | 2 | 1 | 1 | 8192 | bf16 | ct | auto | -0.8% | +23.2% | -71.0% | +5.3% | +| RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 | 1 | 1 | 1 | 8192 | bf16 | ct | auto | -0.4% | +153.4% | -40.0% | -3.9% | + +## Failed Runs + +| Model | TP | PP | DP | max_len | Notes | +|---|---|---|---|---|---| +| google/gemma-2-2b-it | 1 | 1 | 1 | 8192 | | +| google/gemma-2-9b-it | 1 | 1 | 1 | 8192 | | +| google/gemma-2-27b-it | 1 | 1 | 1 | 8192 | | +| google/gemma-3-4b-it | 1 | 1 | 1 | 8192 | | +| google/gemma-3-12b-it | 1 | 1 | 1 | 8192 | | +| google/gemma-3-27b-it | 1 | 1 | 1 | 8192 | | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 2 | 8192 | DP=2 | +| meta-llama/Llama-4-Scout | 4 | 1 | 1 | 8192 | | +| moonshotai/Kimi-Dev-72B | 4 | 1 | 1 | 8192 | second attempt; TP=2 and TP=4 first attempt succeeded | +| openai/GPT-OSS-20B | 1 | 1 | 1 | 8192 | | +| Qwen/Qwen3-14B | 5 | 1 | 1 | 8192 | TP=5 
(non-power-of-2) | + +## Calibration decisions + +_Document constant changes here: old value → new value, evidence._ diff --git a/accuracy/scripts/sweep-codellama.yaml b/accuracy/scripts/sweep-codellama.yaml new file mode 100644 index 00000000..efb4eb77 --- /dev/null +++ b/accuracy/scripts/sweep-codellama.yaml @@ -0,0 +1,21 @@ +defaults: + gpu: H100-80GB + gpu_memory_utilization: "0.95" + max_model_len: 8192 + pp: 1 + dp: 1 + dtype: auto + kv_cache_dtype: auto + quantization: null + vllm_image: vllm/vllm-openai:v0.19.0 + namespace: llmdplanner + results_pvc: vllm-mem-data + node_selector: + nvidia.com/gpu.product: NVIDIA-H100-80GB-HBM3 + +runs: + - model: codellama/CodeLlama-7b-hf # 7B dense; LlamaForCausalLM architecture + tp: 1 + + - model: codellama/CodeLlama-34b-hf # 34B dense; ~65 GiB bf16 + tp: 2 diff --git a/accuracy/scripts/sweep.yaml b/accuracy/scripts/sweep.yaml index 7316a019..75126528 100644 --- a/accuracy/scripts/sweep.yaml +++ b/accuracy/scripts/sweep.yaml @@ -38,8 +38,8 @@ runs: - model: codellama/CodeLlama-7b-hf # 7B dense; LlamaForCausalLM architecture tp: 1 - - model: codellama/CodeLlama-34b-hf # 34B dense; ~65 GiB bf16; tp=1 leaves ~11 GiB KV headroom - tp: 1 # if OOM, retry with tp=2 + - model: codellama/CodeLlama-34b-hf # 34B dense; ~65 GiB bf16; tp=1 OOM risk, using tp=2 + tp: 2 - model: deepseek-ai/DeepSeek-V2-Lite-Chat # 16B total, 2.4B active MoE; DeepSeekV2 arch tp: 1 From 82b1fdfb6bedd1f5a6fcc2a87e15ead4b510fdb7 Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Wed, 22 Apr 2026 13:21:23 -0400 Subject: [PATCH 04/24] Update gemma and codellam res Signed-off-by: Jing Chen --- ...lama-7b-h--h100-80gb--tp1pp1dp1--8192.json | 25 ++++++++++++ ...-2-27b-it--h100-80gb--tp1pp1dp1--8192.json | 29 ++++++++++++++ ...a-2-2b-it--h100-80gb--tp1pp1dp1--8192.json | 29 ++++++++++++++ ...a-2-9b-it--h100-80gb--tp1pp1dp1--8192.json | 29 ++++++++++++++ ...-3-12b-it--h100-80gb--tp1pp1dp1--8192.json | 29 ++++++++++++++ ...-3-27b-it--h100-80gb--tp1pp1dp1--8192.json | 
29 ++++++++++++++ ...a-3-4b-it--h100-80gb--tp1pp1dp1--8192.json | 29 ++++++++++++++ accuracy/scripts/sweep.yaml | 38 ++++++++++++------- 8 files changed, 223 insertions(+), 14 deletions(-) create mode 100644 accuracy/results/v0.19.0/runs/codellama-codellama-7b-h--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.json diff --git a/accuracy/results/v0.19.0/runs/codellama-codellama-7b-h--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/codellama-codellama-7b-h--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..b619f1ee --- /dev/null +++ b/accuracy/results/v0.19.0/runs/codellama-codellama-7b-h--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,25 @@ +{ + "model": "codellama/CodeLlama-7b-hf", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-22T15:13:28.215479+00:00", + "log_path": "/data/results/v0.19.0/logs/codellama-codellama-7b-h--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 12.56, + "kv_cache_memory_gib": 61.66, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 15.41, + "kv_cache_tokens": 126256, + "vllm_version": null, + "vllm_commit": null, + "kv_cache_blocks": 7891, + "kv_block_size_bytes": 8390181 +} \ 
No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..d2cc48d1 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,29 @@ +{ + "model": "google/gemma-2-27b-it", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-22T16:01:46.877908+00:00", + "log_path": "/data/results/v0.19.0/logs/google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 50.72, + "kv_cache_memory_gib": 20.59, + "cuda_graph_memory_gib": 1.13, + "max_concurrency": 7.16, + "kv_cache_tokens": 58672, + "vllm_version": null, + "vllm_commit": null, + "total_non_kv_memory_gib": 54.64, + "activation_memory_gib": 3.66, + "non_torch_forward_memory_gib": 0.26, + "profiling_weights_memory_gib": 50.72, + "kv_cache_blocks": 3667, + "kv_block_size_bytes": 6029000 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..eb38fb0c --- /dev/null +++ b/accuracy/results/v0.19.0/runs/google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,29 @@ +{ + "model": "google/gemma-2-2b-it", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-22T15:56:16.000686+00:00", + "log_path": 
"/data/results/v0.19.0/logs/google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 4.9, + "kv_cache_memory_gib": 66.47, + "cuda_graph_memory_gib": 0.52, + "max_concurrency": 81.73, + "kv_cache_tokens": 670160, + "vllm_version": null, + "vllm_commit": null, + "total_non_kv_memory_gib": 8.76, + "activation_memory_gib": 3.62, + "non_torch_forward_memory_gib": 0.24, + "profiling_weights_memory_gib": 4.9, + "kv_cache_blocks": 41885, + "kv_block_size_bytes": 1703989 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..d5d0b387 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,29 @@ +{ + "model": "google/gemma-2-9b-it", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-22T15:57:56.449808+00:00", + "log_path": "/data/results/v0.19.0/logs/google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 17.22, + "kv_cache_memory_gib": 54.12, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 20.6, + "kv_cache_tokens": 168880, + "vllm_version": null, + "vllm_commit": null, + "total_non_kv_memory_gib": 21.11, + "activation_memory_gib": 3.65, + "non_torch_forward_memory_gib": 0.25, + "profiling_weights_memory_gib": 17.22, + "kv_cache_blocks": 10555, + "kv_block_size_bytes": 5505533 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..93d21dbc --- /dev/null 
+++ b/accuracy/results/v0.19.0/runs/google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,29 @@ +{ + "model": "google/gemma-3-12b-it", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-22T16:06:47.862260+00:00", + "log_path": "/data/results/v0.19.0/logs/google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 23.31, + "kv_cache_memory_gib": 47.72, + "cuda_graph_memory_gib": 1.02, + "max_concurrency": 15.88, + "kv_cache_tokens": 130304, + "vllm_version": null, + "vllm_commit": null, + "total_non_kv_memory_gib": 27.51, + "activation_memory_gib": 3.94, + "non_torch_forward_memory_gib": 0.25, + "profiling_weights_memory_gib": 23.31, + "kv_cache_blocks": 8144, + "kv_block_size_bytes": 6291620 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..9937c6c8 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,29 @@ +{ + "model": "google/gemma-3-27b-it", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-22T16:10:38.908929+00:00", + "log_path": "/data/results/v0.19.0/logs/google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 51.45, + "kv_cache_memory_gib": 19.53, + "cuda_graph_memory_gib": 1.05, + "max_concurrency": 4.46, + "kv_cache_tokens": 36560, + "vllm_version": null, + "vllm_commit": null, + 
"total_non_kv_memory_gib": 55.7, + "activation_memory_gib": 3.99, + "non_torch_forward_memory_gib": 0.26, + "profiling_weights_memory_gib": 51.45, + "kv_cache_blocks": 2285, + "kv_block_size_bytes": 9177320 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..768b7446 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,29 @@ +{ + "model": "google/gemma-3-4b-it", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-22T16:03:57.416575+00:00", + "log_path": "/data/results/v0.19.0/logs/google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 8.58, + "kv_cache_memory_gib": 62.51, + "cuda_graph_memory_gib": 0.7, + "max_concurrency": 57.05, + "kv_cache_tokens": 468144, + "vllm_version": null, + "vllm_commit": null, + "total_non_kv_memory_gib": 12.72, + "activation_memory_gib": 3.89, + "non_torch_forward_memory_gib": 0.25, + "profiling_weights_memory_gib": 8.58, + "kv_cache_blocks": 29259, + "kv_block_size_bytes": 2293981 +} \ No newline at end of file diff --git a/accuracy/scripts/sweep.yaml b/accuracy/scripts/sweep.yaml index 75126528..2feb7647 100644 --- a/accuracy/scripts/sweep.yaml +++ b/accuracy/scripts/sweep.yaml @@ -87,20 +87,30 @@ runs: tp: 4 # tp=1 OOM (~212 GiB total), tp=2 OOM (~106 GiB total) # ── Gemma models ────────────────────────────────────────────────────────── - # BLOCKED: HF token needs access granted at https://huggingface.co/google/gemma-2-2b-it - # (GatedRepoError 403 — all google/gemma-* repos require explicit access approval) - # - model: 
google/gemma-2-2b-it # 2.6B dense; Gemma2 architecture - # tp: 1 - # - model: google/gemma-2-9b-it # 9.2B dense; Gemma2 architecture - # tp: 1 - # - model: google/gemma-2-27b-it # 27.2B dense; fits H100 80GB at bf16 (~54 GiB weights) - # tp: 1 - # - model: google/gemma-3-4b-it # 4B dense; Gemma3 architecture - # tp: 1 - # - model: google/gemma-3-12b-it # 12B dense; Gemma3 architecture - # tp: 1 - # - model: google/gemma-3-27b-it # 27B dense; Gemma3 architecture - # tp: 1 + # Requires hf-token-gemma secret (separate from hf-token; Gemma repos are gated). + - model: google/gemma-2-2b-it # 2.6B dense; Gemma2 architecture + tp: 1 + hf_token_secret: hf-token-gemma + + - model: google/gemma-2-9b-it # 9.2B dense; Gemma2 architecture + tp: 1 + hf_token_secret: hf-token-gemma + + - model: google/gemma-2-27b-it # 27.2B dense; fits H100 80GB at bf16 (~54 GiB weights) + tp: 1 + hf_token_secret: hf-token-gemma + + - model: google/gemma-3-4b-it # 4B dense; Gemma3 architecture + tp: 1 + hf_token_secret: hf-token-gemma + + - model: google/gemma-3-12b-it # 12B dense; Gemma3 architecture + tp: 1 + hf_token_secret: hf-token-gemma + + - model: google/gemma-3-27b-it # 27B dense; Gemma3 architecture + tp: 1 + hf_token_secret: hf-token-gemma # ── Kimi Dev 72B TP sensitivity (retry tp=4; tp=2 succeeded) ───────────── - model: moonshotai/Kimi-Dev-72B From 30ec05f7cab43782bdfa4f529aef70e06ff9f149 Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Wed, 22 Apr 2026 14:13:22 -0400 Subject: [PATCH 05/24] Update report Signed-off-by: Jing Chen --- accuracy/k8s/orchestrator-job-gemma.yaml | 46 ++++ accuracy/results/v0.19.0/accuracy_report.md | 221 +++++++++--------- .../results/v0.19.0/results_predicted.csv | 7 + accuracy/results/v0.19.0/results_raw.csv | 8 + accuracy/results/v0.19.0/run_matrix.md | 43 ++-- 5 files changed, 198 insertions(+), 127 deletions(-) create mode 100644 accuracy/k8s/orchestrator-job-gemma.yaml diff --git a/accuracy/k8s/orchestrator-job-gemma.yaml 
b/accuracy/k8s/orchestrator-job-gemma.yaml new file mode 100644 index 00000000..acc7f303 --- /dev/null +++ b/accuracy/k8s/orchestrator-job-gemma.yaml @@ -0,0 +1,46 @@ +# Submit with: kubectl apply -f accuracy/k8s/orchestrator-job-gemma.yaml +# Monitor with: kubectl logs -f job/vllm-mem-orchestrator-gemma -n llmdplanner +apiVersion: batch/v1 +kind: Job +metadata: + name: vllm-mem-orchestrator-gemma + namespace: llmdplanner +spec: + backoffLimit: 0 + activeDeadlineSeconds: 28800 # 8-hour cap for 6-model run + template: + spec: + serviceAccountName: vllm-mem-orchestrator + restartPolicy: Never + volumes: + - name: data + persistentVolumeClaim: + claimName: vllm-mem-data + - name: scripts + configMap: + name: vllm-mem-scripts + defaultMode: 0755 + - name: sweep + configMap: + name: vllm-mem-sweep-gemma + containers: + - name: orchestrator + image: python:3.11-slim + command: ["/bin/bash", "-c"] + args: + - | + pip install pyyaml kubernetes --quiet --no-cache-dir && + python /scripts/sweep_runner.py \ + --config /sweep/sweep-gemma.yaml \ + --results /data/results/ + volumeMounts: + - name: data + mountPath: /data + - name: scripts + mountPath: /scripts + - name: sweep + mountPath: /sweep + resources: + requests: + cpu: "500m" + memory: "512Mi" diff --git a/accuracy/results/v0.19.0/accuracy_report.md b/accuracy/results/v0.19.0/accuracy_report.md index 615483d6..c6172ae9 100644 --- a/accuracy/results/v0.19.0/accuracy_report.md +++ b/accuracy/results/v0.19.0/accuracy_report.md @@ -1,6 +1,6 @@ # Capacity Planner Accuracy Report — vLLM v0.19.0 / H100-80GB -**Dataset**: 47 successful runs across 22 unique models +**Dataset**: 54 successful runs across 29 unique models **Hardware**: H100-80GB (catalog memory = 80 GiB, actual = ~79.19 GiB) **Planner GPU util**: actual `gpu_memory_utilization` per run (0.95) @@ -8,18 +8,18 @@ | Metric | Mean error | Mean abs error | Notes | |--------|:----------:|:--------------:|-------| -| **KV Cache memory** (all 47 runs) | +0.83% | +7.91% | 
| | -| **KV Cache memory** (baseline: tp=pp=1, len=8192, no-quant) | -4.11% | — | n=16 | -| **Weight memory** | -0.03% | +3.27% | From safetensors metadata | -| **Activation memory** | +196.08% | +196.08% | Largest error source | -| **Non-torch overhead** | -43.67% | +53.40% | | -| **Max concurrency** | -1.57% | +15.90% | Proxy for KV cache accuracy | +| **KV Cache memory** (all 54 runs) | +71.11% | +100.71% | | +| **KV Cache memory** (baseline: tp=pp=1, len=8192, no-quant) | +40.12% | — | n=23 | +| **Weight memory** | +86.28% | +132.35% | From safetensors metadata | +| **Activation memory** | +188.51% | +189.86% | Largest error source | +| **Non-torch overhead** | -22.26% | +68.14% | | +| **Max concurrency** | +286.69% | +345.88% | Proxy for KV cache accuracy | ### Key Findings -1. **Weights are accurate** — mean abs error +3.27%, computed directly from safetensors parameter counts. Errors arise only when `--dtype` overrides the native dtype (e.g., `--dtype float32`) or when quantization is not fully captured in the config. -2. **Activation is the dominant error source** — mean +196.08% (over-estimate). The planner uses empirical constants (4.8–8.0 GiB) measured at `max_model_len=16000`; vLLM v0.19.0 reports 0.75–2.2 GiB across all architectures tested. Granite is worst (+600%), Mistral3/Pixtral is best (+15–23%). -3. **Over-estimated activation partially cancels** the catalog GPU memory inflation (+0.77 GiB), leaving KV cache only +0.83% off on average across all runs. But this is coincidental cancellation of two large opposing errors, not model accuracy. +1. **Weights are accurate** — mean abs error +132.35%, computed directly from safetensors parameter counts. Errors arise only when `--dtype` overrides the native dtype (e.g., `--dtype float32`) or when quantization is not fully captured in the config. +2. **Activation is the dominant error source** — mean +188.51% (over-estimate).
The planner uses empirical constants (4.8–8.0 GiB) measured at `max_model_len=16000`; vLLM v0.19.0 reports 0.75–2.2 GiB across all architectures tested. Granite is worst (+600%), Mistral3/Pixtral is best (+15–23%). +3. **Over-estimated activation partially cancels** the catalog GPU memory inflation (+0.77 GiB), leaving KV cache only +71.11% off on average across all runs. But this is coincidental cancellation of two large opposing errors, not model accuracy. 4. **Non-default KV dtype (`--kv-cache-dtype fp8`) doubles token capacity** but the planner ignores this flag — KV token count is off by ~2× for those runs. 5. **`--dtype float32` breaks weight prediction** — the planner uses the HuggingFace config dtype (BF16) and never sees the vLLM `--dtype` override, giving −50% weight error. 6. **Pipeline parallelism reduces actual activation** (each GPU processes fewer layers) but the formula uses the same constant regardless of PP, compounding the activation error. @@ -29,53 +29,53 @@ > Percent error = (predicted − actual) / actual × 100. Positive = over-estimate, negative = under-estimate. 
-### All 47 Runs (n=47) +### All 47 Runs (n=54) | Component | Mean error | Median | Mean abs | Min | Max | n | |-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| -| Weight | -0.03% | -0.31% | +3.27% | -50.11% | +76.18% | 47 | -| Activation | +196.08% | +153.97% | +196.08% | +14.68% | +633.33% | 47 | -| Non Torch | -43.67% | -40.00% | +53.40% | -72.85% | +114.29% | 47 | -| Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 47 | -| Total Non Kv | +19.52% | +16.70% | +21.45% | -38.61% | +97.49% | 47 | -| Kv Cache | +0.83% | -3.47% | +7.91% | -28.75% | +61.82% | 47 | -| Max Concurrency | -1.57% | -3.48% | +15.90% | -87.31% | +162.10% | 47 | +| Weight | +86.28% | -0.41% | +132.35% | -90.70% | +1696.10% | 54 | +| Activation | +188.51% | +153.39% | +189.86% | -36.55% | +540.00% | 54 | +| Non Torch | -22.26% | -40.00% | +68.14% | -93.21% | +150.00% | 54 | +| Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 54 | +| Total Non Kv | +74.84% | +17.66% | +108.13% | -81.02% | +976.70% | 54 | +| Kv Cache | +71.11% | -3.47% | +100.71% | -92.75% | +1756.45% | 54 | +| Max Concurrency | +286.69% | -2.58% | +345.88% | -98.55% | +5217.43% | 54 | -### Baseline: TP=1, PP=1, len=8192, no quantization, default KV dtype (n=16) +### Baseline: TP=1, PP=1, len=8192, no quantization, default KV dtype (n=23) | Component | Mean error | Median | Mean abs | Min | Max | n | |-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| -| Weight | -3.38% | -0.22% | +3.39% | -50.11% | +0.04% | 16 | -| Activation | +247.20% | +163.97% | +247.20% | +23.15% | +633.33% | 16 | -| Non Torch | -45.25% | -40.00% | +45.25% | -67.39% | -37.50% | 16 | -| Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 16 | -| Total Non Kv | +17.06% | +17.29% | +21.89% | -38.61% | +74.27% | 16 | -| Kv Cache | -4.11% | -4.29% | +8.18% | -28.75% | +31.06% | 16 | -| Max Concurrency | -0.76% | -4.29% | +21.22% | -87.31% | +162.10% | 16 | +| Weight | -6.31% | -44.21% 
| +64.61% | -90.70% | +215.56% | 23 | +| Activation | +206.57% | +153.39% | +209.75% | -36.55% | +540.00% | 23 | +| Non Torch | -19.94% | -40.00% | +57.33% | -67.39% | +150.00% | 23 | +| Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 23 | +| Total Non Kv | +6.55% | -18.08% | +58.84% | -81.02% | +234.58% | 23 | +| Kv Cache | +40.12% | +8.07% | +48.56% | -19.04% | +306.16% | 23 | +| Max Concurrency | +219.13% | +104.27% | +247.44% | -81.60% | +1366.62% | 23 | ### Multi-GPU (TP > 1 or PP > 1) (n=15) | Component | Mean error | Median | Mean abs | Min | Max | n | |-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| -| Weight | -1.20% | -0.38% | +1.20% | -12.22% | -0.03% | 15 | -| Activation | +196.02% | +153.39% | +196.02% | +23.15% | +561.16% | 15 | -| Non Torch | -46.75% | -71.29% | +77.23% | -72.85% | +114.29% | 15 | +| Weight | +255.46% | +55.63% | +285.52% | -79.08% | +1696.10% | 15 | +| Activation | +174.46% | +144.54% | +174.46% | +13.12% | +400.00% | 15 | +| Non Torch | -56.76% | -72.85% | +87.23% | -93.21% | +114.29% | 15 | | Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 15 | -| Total Non Kv | +16.86% | +11.34% | +17.76% | -6.76% | +97.49% | 15 | -| Kv Cache | +11.26% | +4.62% | +11.63% | -1.88% | +61.82% | 15 | -| Max Concurrency | +6.58% | +4.63% | +16.32% | -71.19% | +62.24% | 15 | +| Total Non Kv | +178.64% | +57.97% | +204.71% | -72.39% | +976.70% | 15 | +| Kv Cache | +209.57% | -19.27% | +265.99% | -92.75% | +1756.45% | 15 | +| Max Concurrency | +652.26% | -77.10% | +752.16% | -98.55% | +5217.43% | 15 | ### Quantized Models (fp8-dynamic / w8a8 / w4a16) (n=10) | Component | Mean error | Median | Mean abs | Min | Max | n | |-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| -| Weight | +7.29% | -0.29% | +7.94% | -0.79% | +76.18% | 10 | -| Activation | +124.00% | +149.15% | +124.00% | +14.68% | +153.97% | 10 | -| Non Torch | -52.67% | -41.15% | +52.67% | -72.85% | -37.50% | 10 | +| Weight | 
+58.68% | -0.30% | +69.12% | -28.48% | +501.85% | 10 | +| Activation | +163.06% | +153.68% | +163.06% | +143.65% | +191.01% | 10 | +| Non Torch | -56.97% | -41.15% | +56.97% | -92.89% | -37.50% | 10 | | Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 10 | -| Total Non Kv | +21.87% | +15.87% | +23.22% | -6.76% | +87.45% | 10 | -| Kv Cache | -0.45% | -0.87% | +4.95% | -13.18% | +5.90% | 10 | -| Max Concurrency | -0.44% | -0.86% | +4.95% | -13.19% | +5.89% | 10 | +| Total Non Kv | +66.95% | +29.93% | +69.69% | -13.72% | +433.84% | 10 | +| Kv Cache | -12.54% | -3.43% | +16.96% | -70.20% | +9.04% | 10 | +| Max Concurrency | -23.06% | -25.97% | +49.90% | -92.31% | +104.21% | 10 | ### Non-default KV cache dtype (--kv-cache-dtype fp8) (n=2) @@ -87,7 +87,7 @@ | Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 2 | | Total Non Kv | +17.83% | +17.83% | +17.83% | +16.28% | +19.37% | 2 | | Kv Cache | -3.84% | -3.84% | +3.84% | -4.21% | -3.47% | 2 | -| Max Concurrency | -51.92% | -51.92% | +51.92% | -52.11% | -51.73% | 2 | +| Max Concurrency | -3.84% | -3.84% | +3.84% | -4.22% | -3.47% | 2 | ## Per-Model Errors — Baseline Runs @@ -95,22 +95,29 @@ | Model | Arch | Weight err | Activation err | Non-torch err | KV cache err | Max conc err | |-------|------|:----------:|:--------------:|:-------------:|:------------:|:------------:| -| Qwen2.5-7B-Instruct | Qwen2 | -0.45% | +153.39% | -37.50% | -4.21% | -4.22% | -| Qwen2.5-7B-Instruct | Qwen2 | -0.45% | +153.39% | -37.50% | -4.21% | -4.22% | -| Qwen3-30B-A3B | Qwen3Moe | -0.02% | +198.51% | -44.44% | -28.75% | -28.72% | -| Qwen3-8B | Qwen3 | -0.09% | +153.39% | -40.00% | -4.36% | -4.36% | +| Qwen2.5-7B-Instruct | Qwen2 | -50.23% | +153.39% | +150.00% | +11.92% | +123.83% | +| Qwen2.5-7B-Instruct | Llama | -62.51% | +117.19% | -37.50% | +12.26% | -50.89% | +| Qwen3-30B-A3B | Llama | -85.13% | +79.10% | -44.44% | +306.16% | +204.72% | +| Qwen3-8B | Qwen2 | -46.89% | +153.39% | -40.00% | +8.07% | 
+177.89% | +| CodeLlama-7b-hf | Llama | -0.07% | +523.38% | -40.00% | -5.13% | -5.13% | | DeepSeek-V2-Lite-Chat | DeepseekV2 | -0.59% | +314.51% | -42.31% | -11.50% | -11.50% | -| granite-3.1-2b-instruct | Granite | -0.44% | +633.33% | -67.39% | -5.27% | -5.27% | -| granite-3.1-8b-instruct | Granite | -0.20% | +547.06% | -67.39% | -6.02% | -6.03% | -| granite-3.3-8b-instruct | Granite | -0.20% | +547.06% | -67.39% | -6.02% | -6.03% | -| granite-vision-3.3-2b | LlavaNext* | +0.04% | +216.46% | -40.00% | -1.23% | -1.23% | +| gemma-2-27b-it | Granite | -90.70% | +50.27% | -42.31% | +218.75% | +1366.62% | +| gemma-2-2b-it | Granite | +210.60% | +51.93% | -37.50% | -17.06% | -46.04% | +| gemma-2-9b-it | Granite | -11.62% | +50.68% | -40.00% | +1.87% | +114.08% | +| gemma-3-12b-it | LlavaNext* | -76.22% | -36.55% | -40.00% | +42.10% | +583.19% | +| gemma-3-27b-it | Llama | -70.93% | +20.30% | -42.31% | +187.21% | +1157.62% | +| gemma-3-4b-it | Llama | +74.33% | +23.39% | -40.00% | -10.27% | -1.68% | +| granite-3.1-2b-instruct | Llama | +215.56% | +540.00% | -67.39% | -19.04% | -49.40% | +| granite-3.1-8b-instruct | Llama | -1.92% | +464.71% | -67.39% | -4.38% | +19.52% | +| granite-3.3-8b-instruct | Llama | -1.92% | +464.71% | -67.39% | -4.38% | +19.52% | +| granite-vision-3.3-2b | Llama | +169.99% | +507.59% | -40.00% | -18.29% | +104.27% | | Llama-3.1-8B-Instruct | Llama | -0.22% | +153.97% | -40.00% | -3.47% | -3.48% | -| Llama-3.1-8B-Instruct | Llama | -0.22% | +153.97% | -40.00% | -3.47% | -3.48% | -| Llama-3.1-8B-Instruct | Llama | -50.11% | +117.19% | -40.00% | +31.06% | +162.10% | -| Llama-3.1-8B-Instruct | Llama | -0.22% | +153.97% | -40.00% | -3.47% | -3.48% | -| phi-4 | Phi3 | -0.31% | +261.84% | -40.00% | -6.59% | -6.58% | -| Mistral-Small-3.1-24B-Instruct-2503 | Mistral3* | -0.08% | +23.15% | -40.00% | +1.54% | +1.55% | -| Kimi-VL-A3B-Instruct | KimiVL* | -0.58% | +173.97% | -40.00% | -9.76% | -87.31% | +| Llama-3.1-8B-Instruct | Llama | -75.05% | +153.97% | 
-40.00% | +22.03% | +388.11% | +| Llama-3.1-8B-Instruct | Llama | -75.05% | +117.19% | +140.00% | +53.09% | +512.34% | +| Llama-3.1-8B-Instruct | Llama | -0.22% | +153.97% | -40.00% | -3.47% | -75.87% | +| phi-4 | KimiVL* | -44.21% | +426.32% | +140.00% | +21.79% | +125.53% | +| Mistral-Small-3.1-24B-Instruct-2503 | Qwen2 | -68.31% | +175.86% | -40.00% | +98.88% | +468.29% | +| Kimi-VL-A3B-Instruct | Qwen2 | -53.85% | +91.78% | -40.00% | +35.68% | -81.60% | ## Argument Sensitivity Analysis @@ -120,20 +127,20 @@ | Model | max_model_len | Actual KV (GiB) | KV err | Actual tokens | Pred tokens | Token err | Max conc err | |-------|:-------------:|:---------------:|:------:|:-------------:|:-----------:|:---------:|:------------:| -| Llama-3.1-8B-Instruct | 2,048 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.47% | -| Llama-3.1-8B-Instruct | 4,096 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.47% | +| Llama-3.1-8B-Instruct | 2,048 | 58.11 | +21.25% | 476,016 | 2,308,853 | +385.04% | +21.26% | +| Llama-3.1-8B-Instruct | 4,096 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -75.86% | | Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,000 | 459,509 | -3.46% | -3.48% | -| Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.48% | -| Llama-3.1-8B-Instruct | 8,192 | 42.80 | +31.06% | 175,296 | 459,509 | +162.13% | +162.10% | -| Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.48% | -| Llama-3.1-8B-Instruct | 16,384 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.44% | -| Llama-3.1-8B-Instruct | 32,768 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.51% | -| Qwen2.5-7B-Instruct | 2,048 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | -| Qwen2.5-7B-Instruct | 4,096 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | -| Qwen2.5-7B-Instruct | 8,192 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | -| Qwen2.5-7B-Instruct | 8,192 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | 
-4.22% | -4.22% | -| Qwen2.5-7B-Instruct | 16,384 | 58.53 | -4.21% | 1,095,968 | 1,049,789 | -4.21% | -4.22% | -| Qwen2.5-7B-Instruct | 32,768 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | +| Llama-3.1-8B-Instruct | 8,192 | 58.11 | +22.03% | 476,016 | 2,323,599 | +388.13% | +388.11% | +| Llama-3.1-8B-Instruct | 8,192 | 42.80 | +53.09% | 175,296 | 1,073,499 | +512.39% | +512.34% | +| Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -75.87% | +| Llama-3.1-8B-Instruct | 16,384 | 58.11 | -30.92% | 476,016 | 526,169 | +10.54% | +121.10% | +| Llama-3.1-8B-Instruct | 32,768 | 58.11 | -35.83% | 476,016 | 181,017 | -61.97% | +52.10% | +| Qwen2.5-7B-Instruct | 2,048 | 58.53 | -6.04% | 1,096,000 | 400,450 | -63.46% | -90.87% | +| Qwen2.5-7B-Instruct | 4,096 | 58.53 | -33.09% | 1,096,000 | 256,642 | -76.58% | -88.29% | +| Qwen2.5-7B-Instruct | 8,192 | 58.53 | +11.92% | 1,096,000 | 2,453,196 | +123.83% | +123.83% | +| Qwen2.5-7B-Instruct | 8,192 | 58.53 | +12.26% | 1,096,000 | 538,282 | -50.89% | -50.89% | +| Qwen2.5-7B-Instruct | 16,384 | 58.53 | +20.37% | 1,095,968 | 5,276,861 | +381.48% | +863.00% | +| Qwen2.5-7B-Instruct | 32,768 | 58.53 | -81.24% | 1,096,000 | 119,925 | -89.06% | -56.23% | **Conclusion**: `--max-model-len` has **no effect on KV pool size** — the formula and vLLM agree on this. Activation memory is constant (the fixed profiling overhead does not depend on context length), so the KV pool prediction error stays flat at ~−3 to −4% regardless of whether context is 2 K or 32 K tokens. The token/concurrency predictions carry that same constant KV error forward, plus any error from the per-token KV formula. 
@@ -142,15 +149,15 @@ | Model | TP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err | |-------|:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:| | Llama-3.1-8B-Instruct | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | +| Llama-3.1-8B-Instruct | 1 | 14.99 | -75.05% | 1.89 | +153.97% | 0.25 | -40.00% | +22.03% | +| Llama-3.1-8B-Instruct | 1 | 29.98 | -75.05% | 2.21 | +117.19% | 0.25 | +140.00% | +53.09% | | Llama-3.1-8B-Instruct | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | -| Llama-3.1-8B-Instruct | 1 | 29.98 | -50.11% | 2.21 | +117.19% | 0.25 | -40.00% | +31.06% | -| Llama-3.1-8B-Instruct | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | -| Llama-3.1-8B-Instruct | 2 | 7.51 | -0.42% | 1.89 | +153.97% | 2.07 | -71.01% | +2.76% | -| Llama-3.1-8B-Instruct | 4 | 3.77 | -0.81% | 1.89 | +153.97% | 2.13 | -71.83% | +4.48% | -| Qwen2.5-7B-Instruct | 1 | 14.25 | -0.45% | 2.21 | +153.39% | 0.24 | -37.50% | -4.21% | -| Qwen2.5-7B-Instruct | 1 | 14.25 | -0.45% | 2.21 | +153.39% | 0.24 | -37.50% | -4.21% | -| Qwen2.5-7B-Instruct | 2 | 7.12 | -0.38% | 2.21 | +153.39% | 2.06 | -70.87% | +2.61% | -| Qwen2.5-7B-Instruct | 4 | 3.55 | -0.10% | 2.21 | +153.39% | 2.13 | -71.83% | +4.62% | +| Llama-3.1-8B-Instruct | 2 | 7.51 | +479.17% | 1.89 | +323.28% | 2.07 | -71.01% | -56.23% | +| Llama-3.1-8B-Instruct | 4 | 3.77 | +1696.10% | 1.89 | +196.30% | 2.13 | -71.83% | -92.75% | +| Qwen2.5-7B-Instruct | 1 | 14.25 | -50.23% | 2.21 | +153.39% | 0.24 | +150.00% | +11.92% | +| Qwen2.5-7B-Instruct | 1 | 14.25 | -62.51% | 2.21 | +117.19% | 0.24 | -37.50% | +12.26% | +| Qwen2.5-7B-Instruct | 2 | 7.12 | +237.47% | 2.21 | +13.12% | 2.06 | -92.72% | -22.74% | +| Qwen2.5-7B-Instruct | 4 | 3.55 | +238.42% | 2.21 | +13.12% | 2.13 | -71.83% | -7.73% | **Conclusions**: @@ -163,11 +170,11 @@ 
| PP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err | |:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:| | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | +| 1 | 14.99 | -75.05% | 1.89 | +153.97% | 0.25 | -40.00% | +22.03% | +| 1 | 29.98 | -75.05% | 2.21 | +117.19% | 0.25 | +140.00% | +53.09% | | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | -| 1 | 29.98 | -50.11% | 2.21 | +117.19% | 0.25 | -40.00% | +31.06% | -| 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | -| 2 | 7.51 | -0.42% | 1.10 | +336.36% | 0.07 | +114.29% | -0.85% | -| 4 | 4.26 | -12.22% | 1.05 | +357.14% | 0.07 | +114.29% | +1.59% | +| 2 | 7.51 | +263.59% | 1.10 | +400.00% | 0.07 | +114.29% | -35.31% | +| 4 | 4.26 | +949.87% | 1.05 | +138.10% | 0.07 | +114.29% | -58.99% | **Conclusions**: @@ -182,11 +189,11 @@ | bfloat16 | None | auto | 14.99 | -0.22% | 58.11 | -3.47% | | bfloat16 | None | auto | 14.99 | -0.22% | 58.11 | -3.47% | | bfloat16 | None | fp8 | 14.99 | -0.22% | 58.11 | -3.47% | -| bfloat16 | fp8 | auto | 8.49 | +76.18% | 64.61 | -13.18% | -| float16 | None | auto | 14.99 | -0.22% | 58.11 | -3.47% | -| float16 | compressed-tensors | auto | 8.49 | -0.35% | 64.60 | -3.11% | -| float16 | gptq_marlin | auto | 5.38 | -0.71% | 67.71 | -2.96% | -| float32 | None | auto | 29.98 | -50.11% | 42.80 | +31.06% | +| bfloat16 | fp8 | auto | 8.49 | -11.91% | 64.61 | +2.11% | +| float16 | None | auto | 14.99 | -75.05% | 58.11 | +22.03% | +| float16 | compressed-tensors | auto | 8.49 | +501.85% | 64.60 | -70.20% | +| float16 | gptq_marlin | auto | 5.38 | -9.49% | 67.71 | -3.29% | +| float32 | None | auto | 29.98 | -75.05% | 42.80 | +53.09% | **Conclusions**: @@ -201,14 +208,14 @@ |-------|--------------|----|:-------------------:|:----------:|:---------------:|:------:| | 
Llama-3.3-70B-Instruct-fp8-dyn | compressed-tensors | 2 | 33.88 | -0.12% | 37.28 | +5.04% | | Llama-3.3-70B-Instruct-fp8-dyn | compressed-tensors | 4 | 16.96 | -0.24% | 54.09 | +5.90% | -| Meta-Llama-3.1-8B-Instruct-qua | compressed-tensors | 1 | 8.49 | -0.35% | 64.60 | -3.11% | -| Mistral-Small-3.1-24B-Instruct | compressed-tensors | 1 | 24.07 | -0.18% | 48.73 | +1.22% | -| Mistral-Small-3.1-24B-Instruct | compressed-tensors | 2 | 12.11 | -0.79% | 59.02 | +5.28% | +| Meta-Llama-3.1-8B-Instruct-qua | compressed-tensors | 1 | 8.49 | +501.85% | 64.60 | -70.20% | +| Mistral-Small-3.1-24B-Instruct | compressed-tensors | 1 | 24.07 | -28.48% | 48.73 | +9.04% | +| Mistral-Small-3.1-24B-Instruct | compressed-tensors | 2 | 12.11 | +87.45% | 59.02 | -19.27% | | Qwen2.5-7B-Instruct-fp8-dynami | compressed-tensors | 1 | 8.14 | -0.36% | 64.64 | -3.87% | -| Qwen2.5-7B-Instruct-quantized. | compressed-tensors | 1 | 8.14 | -0.36% | 64.64 | -3.87% | -| Llama-3.3-70B-Instruct-quantiz | compressed-tensors | 2 | 33.88 | -0.12% | 37.28 | +5.04% | -| Llama-3.1-8B-Instruct | fp8 | 1 | 8.49 | +76.18% | 64.61 | -13.18% | -| Meta-Llama-3.1-8B-Instruct-qua | gptq_marlin | 1 | 5.38 | -0.71% | 67.71 | -2.96% | +| Qwen2.5-7B-Instruct-quantized. 
| compressed-tensors | 1 | 8.14 | -1.60% | 64.64 | -3.56% | +| Llama-3.3-70B-Instruct-quantiz | compressed-tensors | 2 | 33.88 | +49.69% | 37.28 | -47.33% | +| Llama-3.1-8B-Instruct | fp8 | 1 | 8.49 | -11.91% | 64.61 | +2.11% | +| Meta-Llama-3.1-8B-Instruct-qua | gptq_marlin | 1 | 5.38 | -9.49% | 67.71 | -3.29% | **Conclusions**: @@ -220,11 +227,11 @@ | Model | kv_cache_dtype | Actual KV (GiB) | KV err | Actual tokens | Pred tokens | Token err | Conc err | |-------|:--------------:|:---------------:|:------:|:-------------:|:-----------:|:---------:|:--------:| -| Qwen2.5-7B-Instruct | auto | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | -| Qwen2.5-7B-Instruct | fp8 | 58.53 | -4.21% | 2,192,000 | 1,049,789 | -52.11% | -52.11% | +| Qwen2.5-7B-Instruct | auto | 58.53 | +11.92% | 1,096,000 | 2,453,196 | +123.83% | +123.83% | +| Qwen2.5-7B-Instruct | fp8 | 58.53 | -4.21% | 2,192,000 | 1,049,789 | -52.11% | -4.22% | ||||||||| | Llama-3.1-8B-Instruct | auto | 58.11 | -3.47% | 476,000 | 459,509 | -3.46% | -3.48% | -| Llama-3.1-8B-Instruct | fp8 | 58.11 | -3.47% | 952,032 | 459,509 | -51.73% | -51.73% | +| Llama-3.1-8B-Instruct | fp8 | 58.11 | -3.47% | 952,032 | 459,509 | -51.73% | -3.47% | ||||||||| **Conclusion**: `--kv-cache-dtype fp8` stores each KV element in 1 byte instead of 2 bytes (BF16/FP16), doubling the number of tokens that fit in the KV pool. The KV pool size in GiB is unaffected (same activation and weight overhead), so the **KV GiB error stays near −4%** (the same as the default-dtype baseline). But because the planner always computes per-token bytes from the model's native compute dtype, **token count and max-concurrency predictions are ~52% too low** for fp8-KV runs. This is a direct, fixable bug: the planner should accept `kv_cache_dtype` as an input parameter and apply 1 byte/token when it is `fp8`. 
@@ -238,16 +245,18 @@ The planner uses **fixed constants per architecture** (e.g., 4.8 GiB for Llama, | Architecture | Predicted (GiB) | Observed range (GiB) | Error range | |-------------|:---------------:|:--------------------:|:-----------:| | DeepseekV2 | 8.00 | 1.93–1.93 | +314.51% to +314.51% | -| Granite | 5.50 | 0.75–0.85 | +547.06% to +633.33% | -| KimiVL* | 8.00 | 2.85–2.92 | +173.97% to +180.70% | -| Llama | 4.80 | 1.05–2.21 | +117.19% to +357.14% | -| LlavaNext* | 2.50 | 0.79–0.79 | +216.46% to +216.46% | -| Mistral3* | 2.50 | 2.03–2.18 | +14.68% to +23.15% | -| Mixtral | 8.00 | 1.21–1.21 | +561.16% to +561.16% | -| Phi3 | 5.50 | 1.52–1.52 | +261.84% to +261.84% | -| Qwen2 | 5.60 | 2.21–2.29 | +144.54% to +153.39% | +| Gemma2 | 5.50 | 1.89–2.18 | +152.29% to +191.01% | +| Gemma3* | 5.50 | 1.89–2.21 | +148.87% to +191.01% | +| Granite | 5.50 | 3.62–3.66 | +50.27% to +51.93% | +| KimiVL* | 8.00 | 1.52–1.89 | +323.28% to +426.32% | +| Llama | 4.80 | 0.75–3.99 | +20.30% to +540.00% | +| LlavaNext* | 2.50 | 3.94–3.94 | -36.55% to -36.55% | +| Mistral3* | 2.50 | 1.05–2.21 | +13.12% to +138.10% | +| Mixtral | 8.00 | 1.89–1.89 | +323.28% to +323.28% | +| Phi3 | 5.50 | 1.10–1.10 | +400.00% to +400.00% | +| Qwen2 | 5.60 | 1.21–2.92 | +91.78% to +362.81% | | Qwen3 | 5.60 | 2.21–2.21 | +153.39% to +153.39% | -| Qwen3Moe | 8.00 | 2.68–2.68 | +198.51% to +198.51% | +| Qwen3Moe | 8.00 | 2.21–2.21 | +261.99% to +261.99% | The discrepancy suggests the constants were measured with an older vLLM version or different compilation settings. Re-calibrating to these v0.19.0 measurements would be the highest-value fix. 
@@ -255,11 +264,11 @@ The discrepancy suggests the constants were measured with an older vLLM version | TP | PP | Constant used | Actual mean (GiB) | Mean error | |:--:|:--:|:-------------:|:-----------------:|:----------:| -| 1 | 1 | 0.15 GiB | 0.27 | -42.23% | +| 1 | 1 | 0.15 GiB | 0.27 | -9.00% | | 1 | 2 | 0.15 GiB | 0.07 | +114.29% | | 1 | 4 | 0.15 GiB | 0.07 | +114.29% | -| 2 | 1 | 0.6 GiB | 2.08 | -71.17% | -| 4 | 1 | 0.6 GiB | 2.17 | -72.34% | +| 2 | 1 | 0.6 GiB | 2.08 | -85.58% | +| 4 | 1 | 0.6 GiB | 2.17 | -77.43% | For TP=1 the formula slightly under-estimates (0.15 vs ~0.25 GiB actual). For TP≥2, NCCL all-reduce buffers push actual non-torch to ~2.1 GiB — 3.5× the 0.60 GiB constant. For PP≥2, P2P send/receive adds overhead that the formula doesn't model at all. @@ -275,7 +284,7 @@ The planner uses 80 GiB (catalog) but H100 physical VRAM is 79.19 GiB: The planner returns 0.0 GiB for CUDA graphs (treating it as included in activation). vLLM allocates the CUDA graph pool *after* sizing the KV cache, so the reported KV pool includes CUDA graph memory. The formula is therefore consistent with the log-reported KV number — no fix needed, but it should be documented. -Observed CUDA graph pool sizes: 0.51–1.85 GiB (mean 1.03 GiB). +Observed CUDA graph pool sizes: 0.51–1.85 GiB (mean 1.02 GiB). 
## Recommendations diff --git a/accuracy/results/v0.19.0/results_predicted.csv b/accuracy/results/v0.19.0/results_predicted.csv index be953156..16a719e0 100644 --- a/accuracy/results/v0.19.0/results_predicted.csv +++ b/accuracy/results/v0.19.0/results_predicted.csv @@ -1,4 +1,5 @@ model,gpu,tp,pp,dp,max_model_len,dtype,quantization,kv_cache_dtype,gpu_memory_utilization,architecture,attention_type,num_hidden_layers,num_kv_heads,head_dimension,kv_dtype_bytes,per_token_kv_bytes,per_token_kv_bytes_per_gpu,pred_weight_memory_gib,pred_activation_memory_gib,pred_non_torch_gib,pred_cuda_graph_gib,pred_total_non_kv_cache_gib,pred_kv_cache_memory_gib,pred_kv_cache_tokens,pred_max_concurrency,pred_total_weight_gib,pred_alloc_kv_total_gib +codellama/CodeLlama-7b-hf,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Multi-head attention,32,32,128,2,524288,524288.0,12.5515,4.8,0.15,0.0,17.5015,58.4985,119804,14.62,12.5515,58.4985 deepseek-ai/DeepSeek-V2-Lite-Chat,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,DeepseekV2ForCausalLM,Multi-head latent attention,27,16,128,2,31104,31104.0,29.2556,8.0,0.15,0.0,37.4056,38.5944,1332317,162.64,29.2556,38.5944 RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic,H100-80GB,2,1,1,8192,torch.bfloat16,compressed-tensors,auto,0.95,LlamaForCausalLM,Grouped-query attention,80,8,128,2,327680,163840.0,33.8395,4.8,0.6,0.0,39.2395,39.1605,256642,31.33,67.679,78.321 RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic,H100-80GB,4,1,1,8192,torch.bfloat16,compressed-tensors,auto,0.95,LlamaForCausalLM,Grouped-query attention,80,8,128,2,327680,81920.0,16.9198,4.8,0.6,0.0,22.3198,57.2802,750783,91.65,67.679,229.121 @@ -46,3 +47,9 @@ RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8,H100-80GB,1,1,1,8192 RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8,H100-80GB,2,1,1,8192,torch.bfloat16,compressed-tensors,auto,0.95,Mistral3ForConditionalGeneration,Grouped-query 
attention,40,8,128,2,163840,81920.0,12.0138,2.5,0.6,0.0,15.1138,62.1362,814431,99.42,24.0276,124.2724 RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8,H100-80GB,1,1,1,8192,torch.float16,compressed-tensors,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,131072.0,8.4601,4.8,0.15,0.0,13.4101,62.5899,512736,62.59,8.4601,62.5899 RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8,H100-80GB,1,1,1,8192,torch.bfloat16,compressed-tensors,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,57344.0,8.1106,5.6,0.15,0.0,13.8606,62.1394,1163533,142.03,8.1106,62.1394 +google/gemma-2-27b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Gemma2ForCausalLM,Grouped-query attention,46,16,128,2,376832,376832.0,50.7145,5.5,0.15,0.0,56.3645,19.6355,55949,6.83,50.7145,19.6355 +google/gemma-2-2b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Gemma2ForCausalLM,Grouped-query attention,26,4,256,2,106496,106496.0,4.8696,5.5,0.15,0.0,10.5196,65.4804,660203,80.59,4.8696,65.4804 +google/gemma-2-9b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Gemma2ForCausalLM,Grouped-query attention,42,8,256,2,344064,344064.0,17.214,5.5,0.15,0.0,22.864,53.136,165824,20.24,17.214,53.136 +google/gemma-3-12b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Gemma3ForConditionalGeneration,Grouped-query attention,48,8,256,2,393216,393216.0,22.7007,5.5,0.15,0.0,28.3507,47.6493,130114,15.88,22.7007,47.6493 +google/gemma-3-27b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Gemma3ForConditionalGeneration,Grouped-query attention,62,16,128,2,507904,507904.0,51.0968,5.5,0.15,0.0,56.7468,19.2532,40702,4.97,51.0968,19.2532 +google/gemma-3-4b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Gemma3ForConditionalGeneration,Grouped-query attention,34,4,256,2,139264,139264.0,8.0095,5.5,0.15,0.0,13.6595,62.3405,480652,58.67,8.0095,62.3405 diff --git a/accuracy/results/v0.19.0/results_raw.csv b/accuracy/results/v0.19.0/results_raw.csv index 632eec77..fe20222e 100644 
--- a/accuracy/results/v0.19.0/results_raw.csv +++ b/accuracy/results/v0.19.0/results_raw.csv @@ -1,14 +1,22 @@ log_file,status,model,gpu,tp,pp,dp,max_model_len,dtype,quantization,kv_cache_dtype,gpu_memory_utilization,init_free_memory_gib,init_total_memory_gib,init_cuda_memory_gib,init_non_torch_memory_gib,requested_memory_gib,weight_memory_gib,weights_memory_gib,activation_memory_gib,non_torch_forward_gib,total_non_kv_cache_gib,cuda_graph_estimated_gib,cuda_graph_actual_gib,kv_cache_memory_gib,kv_cache_tokens,kv_cache_blocks,max_concurrency,summary_free_gib,summary_total_gib,cudagraph_piecewise_count,cudagraph_piecewise_largest,cudagraph_full_count +codellama-codellama-34b---h100-80gb--tp2pp1dp1--8192.FAILED.log,failed,codellama/CodeLlama-34b-hf,H100-80GB,2,1,1,8192,torch.bfloat16,None,auto,0.95,77.66,79.19,1.53,1.51,75.23,,,,,,,,,,,,,,,, +codellama-codellama-7b-h--h100-80gb--tp1pp1dp1--8192.log,ok,codellama/CodeLlama-7b-hf,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,12.56,12.56,0.77,0.25,13.57,0.84,0.89,61.66,126256,7891,15.41,78.68,79.19,51,512,51 deepseek-ai-deepseek-v2---h100-80gb--tp1pp1dp1--8192.log,ok,deepseek-ai/DeepSeek-V2-Lite-Chat,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,29.43,29.43,1.93,0.26,31.62,1.39,1.57,43.61,1505552,94097,183.78,78.68,79.19,51,512,51 fp8dyn-llama-3-3-70b--h100-80gb--tp2pp1dp1--8192.log,ok,RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic,H100-80GB,2,1,1,8192,torch.bfloat16,compressed-tensors,auto,0.95,77.66,79.19,1.53,1.51,75.23,33.88,33.88,1.96,2.1,37.95,1.82,1.79,37.28,244304,15269,29.82,77.66,79.19,51,512,51 fp8dyn-llama-3-3-70b--h100-80gb--tp4pp1dp1--8192.log,ok,RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic,H100-80GB,4,1,1,8192,torch.bfloat16,compressed-tensors,auto,0.95,77.64,79.19,1.55,1.53,75.23,16.96,16.96,1.97,2.21,21.14,1.7,1.66,54.09,708992,44312,86.55,77.64,79.19,51,512,51 
fp8dyn-redhatai-qwen2-5---h100-80gb--tp1pp1dp1--8192.log,ok,RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic,H100-80GB,1,1,1,8192,torch.bfloat16,compressed-tensors,auto,0.95,78.68,79.19,0.51,0.51,75.23,8.14,8.14,2.21,0.24,10.59,0.71,0.81,64.64,1210304,75644,147.74,78.68,79.19,51,512,51 google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log,failed,,H100-80GB,,,,,,,,0.95,,,,,,,,,,,,,,,,,,,,, +google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.log,ok,google/gemma-2-27b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,50.72,50.72,3.66,0.26,54.64,1.13,1.23,20.59,58672,7335,7.16,78.68,79.19,51,512,51 google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log,failed,,H100-80GB,,,,,,,,0.95,,,,,,,,,,,,,,,,,,,,, +google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.log,ok,google/gemma-2-2b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,4.9,4.9,3.62,0.24,8.76,0.52,0.71,66.47,670160,83770,81.73,78.68,79.19,51,512,51 google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log,failed,,H100-80GB,,,,,,,,0.95,,,,,,,,,,,,,,,,,,,,, +google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.log,ok,google/gemma-2-9b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,17.22,17.22,3.65,0.25,21.11,0.84,1.09,54.12,168880,21110,20.6,78.68,79.19,51,512,51 google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log,failed,,H100-80GB,,,,,,,,0.95,,,,,,,,,,,,,,,,,,,,, +google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.log,ok,google/gemma-3-12b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,23.31,23.31,3.94,0.25,27.51,1.02,0.94,47.72,130304,48867,15.88,78.68,79.19,51,512,51 google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log,failed,,H100-80GB,,,,,,,,0.95,,,,,,,,,,,,,,,,,,,,, 
+google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.log,ok,google/gemma-3-27b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,51.45,51.45,3.99,0.26,55.7,1.05,1.14,19.53,36560,15997,4.46,78.68,79.19,51,512,51 google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log,failed,,H100-80GB,,,,,,,,0.95,,,,,,,,,,,,,,,,,,,,, +google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.log,ok,google/gemma-3-4b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,8.58,8.58,3.89,0.25,12.72,0.7,0.67,62.51,468144,204817,57.05,78.68,79.19,51,512,51 granite-3-1-2b--h100-80gb--tp1pp1dp1--8192.log,ok,ibm-granite/granite-3.1-2b-instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,4.74,4.74,0.75,0.46,5.95,1.6,0.84,69.28,908048,56753,110.85,78.68,79.19,51,512,51 granite-3-1-8b--h100-80gb--tp1pp1dp1--8192.log,ok,ibm-granite/granite-3.1-8b-instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,15.25,15.25,0.85,0.46,16.57,0.74,0.98,58.66,384432,24027,46.93,78.68,79.19,51,512,51 ibm-granite-granite-3-3---h100-80gb--tp1pp1dp1--8192.log,ok,ibm-granite/granite-3.3-8b-instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,15.25,15.25,0.85,0.46,16.57,0.74,0.98,58.66,384432,24027,46.93,78.68,79.19,51,512,51 diff --git a/accuracy/results/v0.19.0/run_matrix.md b/accuracy/results/v0.19.0/run_matrix.md index 46bc15c1..40c7678c 100644 --- a/accuracy/results/v0.19.0/run_matrix.md +++ b/accuracy/results/v0.19.0/run_matrix.md @@ -1,6 +1,6 @@ # Run Matrix — vLLM v0.19.0 / H100-80GB -**47 successful runs, 11 failed runs.** +**52 successful runs, 6 failed runs.** Quantization abbreviations: `ct` = compressed-tensors, `gptq` = gptq_marlin, `fp8` = fp8 inline, `—` = none. 
@@ -8,10 +8,14 @@ Quantization abbreviations: `ct` = compressed-tensors, `gptq` = gptq_marlin, `fp | Model | TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight | Activation | Non-torch | KV cache | |---|---|---|---|---|---|---|---|---|---|---|---| +| codellama/CodeLlama-7b-hf | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +523.4% | -40.0% | -5.1% | | deepseek-ai/DeepSeek-V2-Lite-Chat | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.6% | +314.5% | -42.3% | -11.5% | -| RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic | 2 | 1 | 1 | 8192 | bf16 | ct | auto | -0.1% | +144.9% | -71.4% | +5.0% | -| RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic | 4 | 1 | 1 | 8192 | bf16 | ct | auto | -0.2% | +143.7% | -72.9% | +5.9% | -| RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic | 1 | 1 | 1 | 8192 | bf16 | ct | auto | -0.4% | +153.4% | -37.5% | -3.9% | +| google/gemma-2-27b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +50.3% | -42.3% | -4.6% | +| google/gemma-2-2b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.6% | +51.9% | -37.5% | -1.5% | +| google/gemma-2-9b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +50.7% | -40.0% | -1.8% | +| google/gemma-3-12b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -2.6% | +39.6% | -40.0% | -0.1% | +| google/gemma-3-27b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.7% | +37.8% | -42.3% | -1.4% | +| google/gemma-3-4b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -6.6% | +41.4% | -40.0% | -0.3% | | ibm-granite/granite-3.1-2b-instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.4% | +633.3% | -67.4% | -5.3% | | ibm-granite/granite-3.1-8b-instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +547.1% | -67.4% | -6.0% | | ibm-granite/granite-3.3-8b-instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +547.1% | -67.4% | -6.0% | @@ -23,13 +27,12 @@ Quantization abbreviations: `ct` = compressed-tensors, `gptq` = gptq_marlin, `fp | meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | f32 | — | auto | -50.1% | +117.2% | -40.0% | +31.1% | | meta-llama/Llama-3.1-8B-Instruct 
| 1 | 1 | 1 | 2048 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | | meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 4096 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 16384 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 32768 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | | meta-llama/Llama-3.1-8B-Instruct | 1 | 2 | 1 | 8192 | bf16 | — | auto | -0.4% | +336.4% | +114.3% | -0.9% | | meta-llama/Llama-3.1-8B-Instruct | 1 | 4 | 1 | 8192 | bf16 | — | auto | -12.2% | +357.1% | +114.3% | +1.6% | | meta-llama/Llama-3.1-8B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.4% | +154.0% | -71.0% | +2.8% | | meta-llama/Llama-3.1-8B-Instruct | 4 | 1 | 1 | 8192 | bf16 | — | auto | -0.8% | +154.0% | -71.8% | +4.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 16384 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 32768 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | | microsoft/phi-4 | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.3% | +261.8% | -40.0% | -6.6% | | mistralai/Mistral-Small-3.1-24B-Instruct-2503 | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +23.2% | -40.0% | +1.5% | | mistralai/Mixtral-8x7B-Instruct-v0.1 | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +561.2% | -71.0% | -1.9% | @@ -37,39 +40,37 @@ Quantization abbreviations: `ct` = compressed-tensors, `gptq` = gptq_marlin, `fp | moonshotai/Kimi-Dev-72B | 4 | 1 | 1 | 8192 | bf16 | — | auto | -0.5% | +144.5% | -72.9% | +9.3% | | moonshotai/Kimi-VL-A3B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.6% | +174.0% | -40.0% | -9.8% | | moonshotai/Kimi-VL-A3B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -1.6% | +180.7% | -71.0% | +2.4% | +| Qwen/Qwen2.5-72B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | 
+144.5% | -71.3% | +60.8% | | Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | fp8 | -0.5% | +153.4% | -37.5% | -4.2% | | Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | -| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 2048 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | -| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 4096 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | | Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 16384 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | | Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 32768 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 2048 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 4096 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | | Qwen/Qwen2.5-7B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.4% | +153.4% | -70.9% | +2.6% | | Qwen/Qwen2.5-7B-Instruct | 4 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +153.4% | -71.8% | +4.6% | -| Qwen/Qwen2.5-72B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +144.5% | -71.3% | +60.8% | -| Qwen/Qwen3-8B | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +153.4% | -40.0% | -4.4% | | Qwen/Qwen3-30B-A3B | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +198.5% | -44.4% | -28.8% | +| Qwen/Qwen3-8B | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +153.4% | -40.0% | -4.4% | +| RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic | 2 | 1 | 1 | 8192 | bf16 | ct | auto | -0.1% | +144.9% | -71.4% | +5.0% | +| RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic | 4 | 1 | 1 | 8192 | bf16 | ct | auto | -0.2% | +143.7% | -72.9% | +5.9% | | redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 | 2 | 1 | 1 | 8192 | bf16 | ct | auto | -0.1% | +144.9% | -71.6% | +5.0% | | RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 | 1 | 1 | 1 | 8192 | f16 | gptq | auto | -0.7% | +154.0% | -40.0% | -3.0% | | RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | 1 | 1 | 1 | 8192 | 
f16 | ct | auto | -0.4% | +154.0% | -40.0% | -3.1% | | RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 | 1 | 1 | 1 | 8192 | bf16 | ct | auto | -0.2% | +14.7% | -42.3% | +1.2% | | RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 | 2 | 1 | 1 | 8192 | bf16 | ct | auto | -0.8% | +23.2% | -71.0% | +5.3% | +| RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic | 1 | 1 | 1 | 8192 | bf16 | ct | auto | -0.4% | +153.4% | -37.5% | -3.9% | | RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 | 1 | 1 | 1 | 8192 | bf16 | ct | auto | -0.4% | +153.4% | -40.0% | -3.9% | ## Failed Runs | Model | TP | PP | DP | max_len | Notes | |---|---|---|---|---|---| -| google/gemma-2-2b-it | 1 | 1 | 1 | 8192 | | -| google/gemma-2-9b-it | 1 | 1 | 1 | 8192 | | -| google/gemma-2-27b-it | 1 | 1 | 1 | 8192 | | -| google/gemma-3-4b-it | 1 | 1 | 1 | 8192 | | -| google/gemma-3-12b-it | 1 | 1 | 1 | 8192 | | -| google/gemma-3-27b-it | 1 | 1 | 1 | 8192 | | +| codellama/CodeLlama-34b-hf | 2 | 1 | 1 | 8192 | GPU contention at runtime | | meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 2 | 8192 | DP=2 | -| meta-llama/Llama-4-Scout | 4 | 1 | 1 | 8192 | | -| moonshotai/Kimi-Dev-72B | 4 | 1 | 1 | 8192 | second attempt; TP=2 and TP=4 first attempt succeeded | -| openai/GPT-OSS-20B | 1 | 1 | 1 | 8192 | | -| Qwen/Qwen3-14B | 5 | 1 | 1 | 8192 | TP=5 (non-power-of-2) | +| meta-llama/Llama-4-Scout-17B-16E-Instruct | 4 | 1 | 1 | 8192 | | +| moonshotai/Kimi-Dev-72B | 4 | 1 | 1 | 8192 | second attempt; tp=2 succeeded | +| openai/gpt-oss-20b | 1 | 1 | 1 | 8192 | | +| Qwen/Qwen3-14B | 5 | 1 | 1 | 8192 | tp=5 invalid (vocab not divisible by 5) | ## Calibration decisions From c9f015d3d3bcb80b94e57f8121821c83c624a03e Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Wed, 22 Apr 2026 14:55:39 -0400 Subject: [PATCH 06/24] Fix multiplier Signed-off-by: Jing Chen --- accuracy/results/v0.19.0/accuracy_report.md | 216 ++++++++++---------- accuracy/scripts/analyze.py | 22 +- accuracy/scripts/sweep-gemma.yaml | 35 ++++ 
accuracy/scripts/sweep_runner.py | 2 +- 4 files changed, 164 insertions(+), 111 deletions(-) create mode 100644 accuracy/scripts/sweep-gemma.yaml diff --git a/accuracy/results/v0.19.0/accuracy_report.md b/accuracy/results/v0.19.0/accuracy_report.md index c6172ae9..d4165ec5 100644 --- a/accuracy/results/v0.19.0/accuracy_report.md +++ b/accuracy/results/v0.19.0/accuracy_report.md @@ -8,18 +8,18 @@ | Metric | Mean error | Mean abs error | Notes | |--------|:----------:|:--------------:|-------| -| **KV Cache memory** (all 47 runs) | +71.11% | +100.71% | | -| **KV Cache memory** (baseline: tp=pp=1, len=8192, no-quant) | +40.12% | — | n=23 | -| **Weight memory** | +86.28% | +132.35% | From safetensors metadata | -| **Activation memory** | +188.51% | +189.86% | Largest error source | -| **Non-torch overhead** | -22.26% | +68.14% | | -| **Max concurrency** | +286.69% | +345.88% | Proxy for KV cache accuracy | +| **KV Cache memory** (all 47 runs) | +0.45% | +7.16% | | +| **KV Cache memory** (baseline: tp=pp=1, len=8192, no-quant) | -3.51% | — | n=23 | +| **Weight memory** | -0.22% | +3.04% | From safetensors metadata | +| **Activation memory** | +185.38% | +185.38% | Largest error source | +| **Non-torch overhead** | -43.23% | +51.70% | | +| **Max concurrency** | -1.34% | +14.34% | Proxy for KV cache accuracy | ### Key Findings -1. **Weights are accurate** — mean abs error +132.35%, computed directly from safetensors parameter counts. Errors arise only when `--dtype` overrides the native dtype (e.g., `--dtype float32`) or when quantization is not fully captured in the config. -2. **Activation is the dominant error source** — mean +188.51% (over-estimate). The planner uses empirical constants (4.8–8.0 GiB) measured at `max_model_len=16000`; vLLM v0.19.0 reports 0.75–2.2 GiB across all architectures tested. Granite is worst (+600%), Mistral3/Pixtral is best (+15–23%). -3. 
**Over-estimated activation partially cancels** the catalog GPU memory inflation (+0.77 GiB), leaving KV cache only +71.11% off on average across all runs. But this is coincidental cancellation of two large opposing errors, not model accuracy. +1. **Weights are accurate** — mean abs error +3.04%, computed directly from safetensors parameter counts. Errors arise only when `--dtype` overrides the native dtype (e.g., `--dtype float32`) or when quantization is not fully captured in the config. +2. **Activation is the dominant error source** — mean +185.38% (over-estimate). The planner uses empirical constants (4.8–8.0 GiB) measured at `max_model_len=16000`; vLLM v0.19.0 reports 0.75–2.2 GiB across all architectures tested. Granite is worst (+600%), Mistral3/Pixtral is best (+15–23%). +3. **Over-estimated activation partially cancels** the catalog GPU memory inflation (+0.77 GiB), leaving KV cache only +0.45% off on average across all runs. But this is coincidental cancellation of two large opposing errors, not model accuracy. 4. **Non-default KV dtype (`--kv-cache-dtype fp8`) doubles token capacity** but the planner ignores this flag — KV token count is off by ~2× for those runs. 5. **`--dtype float32` breaks weight prediction** — the planner uses the HuggingFace config dtype (BF16) and never sees the vLLM `--dtype` override, giving −50% weight error. 6. **Pipeline parallelism reduces actual activation** (each GPU processes fewer layers) but the formula uses the same constant regardless of PP, compounding the activation error. 
@@ -33,49 +33,49 @@ | Component | Mean error | Median | Mean abs | Min | Max | n | |-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| -| Weight | +86.28% | -0.41% | +132.35% | -90.70% | +1696.10% | 54 | -| Activation | +188.51% | +153.39% | +189.86% | -36.55% | +540.00% | 54 | -| Non Torch | -22.26% | -40.00% | +68.14% | -93.21% | +150.00% | 54 | +| Weight | -0.22% | -0.33% | +3.04% | -50.11% | +76.18% | 54 | +| Activation | +185.38% | +153.39% | +185.38% | +14.68% | +633.33% | 54 | +| Non Torch | -43.23% | -40.00% | +51.70% | -72.85% | +114.29% | 54 | | Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 54 | -| Total Non Kv | +74.84% | +17.66% | +108.13% | -81.02% | +976.70% | 54 | -| Kv Cache | +71.11% | -3.47% | +100.71% | -92.75% | +1756.45% | 54 | -| Max Concurrency | +286.69% | -2.58% | +345.88% | -98.55% | +5217.43% | 54 | +| Total Non Kv | +18.33% | +16.28% | +20.02% | -38.61% | +97.49% | 54 | +| Kv Cache | +0.45% | -3.47% | +7.16% | -28.75% | +61.82% | 54 | +| Max Concurrency | -1.34% | -3.47% | +14.34% | -87.31% | +162.10% | 54 | ### Baseline: TP=1, PP=1, len=8192, no quantization, default KV dtype (n=23) | Component | Mean error | Median | Mean abs | Min | Max | n | |-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| -| Weight | -6.31% | -44.21% | +64.61% | -90.70% | +215.56% | 23 | -| Activation | +206.57% | +153.39% | +209.75% | -36.55% | +540.00% | 23 | -| Non Torch | -19.94% | -40.00% | +57.33% | -67.39% | +150.00% | 23 | +| Weight | -2.82% | -0.22% | +2.82% | -50.11% | +0.04% | 23 | +| Activation | +206.53% | +153.97% | +206.53% | +23.15% | +633.33% | 23 | +| Non Torch | -43.74% | -40.00% | +43.74% | -67.39% | -37.50% | 23 | | Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 23 | -| Total Non Kv | +6.55% | -18.08% | +58.84% | -81.02% | +234.58% | 23 | -| Kv Cache | +40.12% | +8.07% | +48.56% | -19.04% | +306.16% | 23 | -| Max Concurrency | +219.13% | +104.27% | +247.44% | -81.60% | +1366.62% 
| 23 | +| Total Non Kv | +15.04% | +16.28% | +18.40% | -38.61% | +74.27% | 23 | +| Kv Cache | -3.51% | -4.21% | +6.34% | -28.75% | +31.06% | 23 | +| Max Concurrency | -0.47% | -4.22% | +15.94% | -87.31% | +162.10% | 23 | ### Multi-GPU (TP > 1 or PP > 1) (n=15) | Component | Mean error | Median | Mean abs | Min | Max | n | |-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| -| Weight | +255.46% | +55.63% | +285.52% | -79.08% | +1696.10% | 15 | -| Activation | +174.46% | +144.54% | +174.46% | +13.12% | +400.00% | 15 | -| Non Torch | -56.76% | -72.85% | +87.23% | -93.21% | +114.29% | 15 | +| Weight | -1.20% | -0.38% | +1.20% | -12.22% | -0.03% | 15 | +| Activation | +196.02% | +153.39% | +196.02% | +23.15% | +561.16% | 15 | +| Non Torch | -46.75% | -71.29% | +77.23% | -72.85% | +114.29% | 15 | | Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 15 | -| Total Non Kv | +178.64% | +57.97% | +204.71% | -72.39% | +976.70% | 15 | -| Kv Cache | +209.57% | -19.27% | +265.99% | -92.75% | +1756.45% | 15 | -| Max Concurrency | +652.26% | -77.10% | +752.16% | -98.55% | +5217.43% | 15 | +| Total Non Kv | +16.86% | +11.34% | +17.76% | -6.76% | +97.49% | 15 | +| Kv Cache | +11.26% | +4.62% | +11.63% | -1.88% | +61.82% | 15 | +| Max Concurrency | +6.58% | +4.63% | +16.32% | -71.19% | +62.24% | 15 | ### Quantized Models (fp8-dynamic / w8a8 / w4a16) (n=10) | Component | Mean error | Median | Mean abs | Min | Max | n | |-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| -| Weight | +58.68% | -0.30% | +69.12% | -28.48% | +501.85% | 10 | -| Activation | +163.06% | +153.68% | +163.06% | +143.65% | +191.01% | 10 | -| Non Torch | -56.97% | -41.15% | +56.97% | -92.89% | -37.50% | 10 | +| Weight | +7.29% | -0.29% | +7.94% | -0.79% | +76.18% | 10 | +| Activation | +124.00% | +149.15% | +124.00% | +14.68% | +153.97% | 10 | +| Non Torch | -52.67% | -41.15% | +52.67% | -72.85% | -37.50% | 10 | | Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | 
-100.00% | 10 | -| Total Non Kv | +66.95% | +29.93% | +69.69% | -13.72% | +433.84% | 10 | -| Kv Cache | -12.54% | -3.43% | +16.96% | -70.20% | +9.04% | 10 | -| Max Concurrency | -23.06% | -25.97% | +49.90% | -92.31% | +104.21% | 10 | +| Total Non Kv | +21.87% | +15.87% | +23.22% | -6.76% | +87.45% | 10 | +| Kv Cache | -0.45% | -0.87% | +4.95% | -13.18% | +5.90% | 10 | +| Max Concurrency | -0.44% | -0.86% | +4.95% | -13.19% | +5.89% | 10 | ### Non-default KV cache dtype (--kv-cache-dtype fp8) (n=2) @@ -87,7 +87,7 @@ | Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 2 | | Total Non Kv | +17.83% | +17.83% | +17.83% | +16.28% | +19.37% | 2 | | Kv Cache | -3.84% | -3.84% | +3.84% | -4.21% | -3.47% | 2 | -| Max Concurrency | -3.84% | -3.84% | +3.84% | -4.22% | -3.47% | 2 | +| Max Concurrency | -51.92% | -51.92% | +51.92% | -52.11% | -51.73% | 2 | ## Per-Model Errors — Baseline Runs @@ -95,29 +95,29 @@ | Model | Arch | Weight err | Activation err | Non-torch err | KV cache err | Max conc err | |-------|------|:----------:|:--------------:|:-------------:|:------------:|:------------:| -| Qwen2.5-7B-Instruct | Qwen2 | -50.23% | +153.39% | +150.00% | +11.92% | +123.83% | -| Qwen2.5-7B-Instruct | Llama | -62.51% | +117.19% | -37.50% | +12.26% | -50.89% | -| Qwen3-30B-A3B | Llama | -85.13% | +79.10% | -44.44% | +306.16% | +204.72% | -| Qwen3-8B | Qwen2 | -46.89% | +153.39% | -40.00% | +8.07% | +177.89% | +| Qwen2.5-7B-Instruct | Qwen2 | -0.45% | +153.39% | -37.50% | -4.21% | -4.22% | +| Qwen2.5-7B-Instruct | Qwen2 | -0.45% | +153.39% | -37.50% | -4.21% | -4.22% | +| Qwen3-30B-A3B | Qwen3Moe | -0.02% | +198.51% | -44.44% | -28.75% | -28.72% | +| Qwen3-8B | Qwen3 | -0.09% | +153.39% | -40.00% | -4.36% | -4.36% | | CodeLlama-7b-hf | Llama | -0.07% | +523.38% | -40.00% | -5.13% | -5.13% | | DeepSeek-V2-Lite-Chat | DeepseekV2 | -0.59% | +314.51% | -42.31% | -11.50% | -11.50% | -| gemma-2-27b-it | Granite | -90.70% | +50.27% | -42.31% | +218.75% | +1366.62% | -| 
gemma-2-2b-it | Granite | +210.60% | +51.93% | -37.50% | -17.06% | -46.04% | -| gemma-2-9b-it | Granite | -11.62% | +50.68% | -40.00% | +1.87% | +114.08% | -| gemma-3-12b-it | LlavaNext* | -76.22% | -36.55% | -40.00% | +42.10% | +583.19% | -| gemma-3-27b-it | Llama | -70.93% | +20.30% | -42.31% | +187.21% | +1157.62% | -| gemma-3-4b-it | Llama | +74.33% | +23.39% | -40.00% | -10.27% | -1.68% | -| granite-3.1-2b-instruct | Llama | +215.56% | +540.00% | -67.39% | -19.04% | -49.40% | -| granite-3.1-8b-instruct | Llama | -1.92% | +464.71% | -67.39% | -4.38% | +19.52% | -| granite-3.3-8b-instruct | Llama | -1.92% | +464.71% | -67.39% | -4.38% | +19.52% | -| granite-vision-3.3-2b | Llama | +169.99% | +507.59% | -40.00% | -18.29% | +104.27% | +| gemma-2-27b-it | Gemma2 | -0.01% | +50.27% | -42.31% | -4.64% | -4.61% | +| gemma-2-2b-it | Gemma2 | -0.62% | +51.93% | -37.50% | -1.49% | -1.39% | +| gemma-2-9b-it | Gemma2 | -0.03% | +50.68% | -40.00% | -1.82% | -1.75% | +| gemma-3-12b-it | Gemma3* | -2.61% | +39.59% | -40.00% | -0.15% | +0.00% | +| gemma-3-27b-it | Gemma3* | -0.69% | +37.84% | -42.31% | -1.42% | +11.43% | +| gemma-3-4b-it | Gemma3* | -6.65% | +41.39% | -40.00% | -0.27% | +2.84% | +| granite-3.1-2b-instruct | Granite | -0.44% | +633.33% | -67.39% | -5.27% | -5.27% | +| granite-3.1-8b-instruct | Granite | -0.20% | +547.06% | -67.39% | -6.02% | -6.03% | +| granite-3.3-8b-instruct | Granite | -0.20% | +547.06% | -67.39% | -6.02% | -6.03% | +| granite-vision-3.3-2b | LlavaNext* | +0.04% | +216.46% | -40.00% | -1.23% | -1.23% | | Llama-3.1-8B-Instruct | Llama | -0.22% | +153.97% | -40.00% | -3.47% | -3.48% | -| Llama-3.1-8B-Instruct | Llama | -75.05% | +153.97% | -40.00% | +22.03% | +388.11% | -| Llama-3.1-8B-Instruct | Llama | -75.05% | +117.19% | +140.00% | +53.09% | +512.34% | -| Llama-3.1-8B-Instruct | Llama | -0.22% | +153.97% | -40.00% | -3.47% | -75.87% | -| phi-4 | KimiVL* | -44.21% | +426.32% | +140.00% | +21.79% | +125.53% | -| 
Mistral-Small-3.1-24B-Instruct-2503 | Qwen2 | -68.31% | +175.86% | -40.00% | +98.88% | +468.29% | -| Kimi-VL-A3B-Instruct | Qwen2 | -53.85% | +91.78% | -40.00% | +35.68% | -81.60% | +| Llama-3.1-8B-Instruct | Llama | -0.22% | +153.97% | -40.00% | -3.47% | -3.48% | +| Llama-3.1-8B-Instruct | Llama | -50.11% | +117.19% | -40.00% | +31.06% | +162.10% | +| Llama-3.1-8B-Instruct | Llama | -0.22% | +153.97% | -40.00% | -3.47% | -3.48% | +| phi-4 | Phi3 | -0.31% | +261.84% | -40.00% | -6.59% | -6.58% | +| Mistral-Small-3.1-24B-Instruct-2503 | Mistral3* | -0.08% | +23.15% | -40.00% | +1.54% | +1.55% | +| Kimi-VL-A3B-Instruct | KimiVL* | -0.58% | +173.97% | -40.00% | -9.76% | -87.31% | ## Argument Sensitivity Analysis @@ -127,20 +127,20 @@ | Model | max_model_len | Actual KV (GiB) | KV err | Actual tokens | Pred tokens | Token err | Max conc err | |-------|:-------------:|:---------------:|:------:|:-------------:|:-----------:|:---------:|:------------:| -| Llama-3.1-8B-Instruct | 2,048 | 58.11 | +21.25% | 476,016 | 2,308,853 | +385.04% | +21.26% | -| Llama-3.1-8B-Instruct | 4,096 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -75.86% | +| Llama-3.1-8B-Instruct | 2,048 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.47% | +| Llama-3.1-8B-Instruct | 4,096 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.47% | | Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,000 | 459,509 | -3.46% | -3.48% | -| Llama-3.1-8B-Instruct | 8,192 | 58.11 | +22.03% | 476,016 | 2,323,599 | +388.13% | +388.11% | -| Llama-3.1-8B-Instruct | 8,192 | 42.80 | +53.09% | 175,296 | 1,073,499 | +512.39% | +512.34% | -| Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -75.87% | -| Llama-3.1-8B-Instruct | 16,384 | 58.11 | -30.92% | 476,016 | 526,169 | +10.54% | +121.10% | -| Llama-3.1-8B-Instruct | 32,768 | 58.11 | -35.83% | 476,016 | 181,017 | -61.97% | +52.10% | -| Qwen2.5-7B-Instruct | 2,048 | 58.53 | -6.04% | 1,096,000 | 400,450 | -63.46% | -90.87% | -| 
Qwen2.5-7B-Instruct | 4,096 | 58.53 | -33.09% | 1,096,000 | 256,642 | -76.58% | -88.29% | -| Qwen2.5-7B-Instruct | 8,192 | 58.53 | +11.92% | 1,096,000 | 2,453,196 | +123.83% | +123.83% | -| Qwen2.5-7B-Instruct | 8,192 | 58.53 | +12.26% | 1,096,000 | 538,282 | -50.89% | -50.89% | -| Qwen2.5-7B-Instruct | 16,384 | 58.53 | +20.37% | 1,095,968 | 5,276,861 | +381.48% | +863.00% | -| Qwen2.5-7B-Instruct | 32,768 | 58.53 | -81.24% | 1,096,000 | 119,925 | -89.06% | -56.23% | +| Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.48% | +| Llama-3.1-8B-Instruct | 8,192 | 42.80 | +31.06% | 175,296 | 459,509 | +162.13% | +162.10% | +| Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.48% | +| Llama-3.1-8B-Instruct | 16,384 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.44% | +| Llama-3.1-8B-Instruct | 32,768 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.51% | +| Qwen2.5-7B-Instruct | 2,048 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | +| Qwen2.5-7B-Instruct | 4,096 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | +| Qwen2.5-7B-Instruct | 8,192 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | +| Qwen2.5-7B-Instruct | 8,192 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | +| Qwen2.5-7B-Instruct | 16,384 | 58.53 | -4.21% | 1,095,968 | 1,049,789 | -4.21% | -4.22% | +| Qwen2.5-7B-Instruct | 32,768 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | **Conclusion**: `--max-model-len` has **no effect on KV pool size** — the formula and vLLM agree on this. Activation memory is constant (the fixed profiling overhead does not depend on context length), so the KV pool prediction error stays flat at ~−3 to −4% regardless of whether context is 2 K or 32 K tokens. The token/concurrency predictions carry that same constant KV error forward, plus any error from the per-token KV formula. 
@@ -149,15 +149,15 @@ | Model | TP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err | |-------|:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:| | Llama-3.1-8B-Instruct | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | -| Llama-3.1-8B-Instruct | 1 | 14.99 | -75.05% | 1.89 | +153.97% | 0.25 | -40.00% | +22.03% | -| Llama-3.1-8B-Instruct | 1 | 29.98 | -75.05% | 2.21 | +117.19% | 0.25 | +140.00% | +53.09% | | Llama-3.1-8B-Instruct | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | -| Llama-3.1-8B-Instruct | 2 | 7.51 | +479.17% | 1.89 | +323.28% | 2.07 | -71.01% | -56.23% | -| Llama-3.1-8B-Instruct | 4 | 3.77 | +1696.10% | 1.89 | +196.30% | 2.13 | -71.83% | -92.75% | -| Qwen2.5-7B-Instruct | 1 | 14.25 | -50.23% | 2.21 | +153.39% | 0.24 | +150.00% | +11.92% | -| Qwen2.5-7B-Instruct | 1 | 14.25 | -62.51% | 2.21 | +117.19% | 0.24 | -37.50% | +12.26% | -| Qwen2.5-7B-Instruct | 2 | 7.12 | +237.47% | 2.21 | +13.12% | 2.06 | -92.72% | -22.74% | -| Qwen2.5-7B-Instruct | 4 | 3.55 | +238.42% | 2.21 | +13.12% | 2.13 | -71.83% | -7.73% | +| Llama-3.1-8B-Instruct | 1 | 29.98 | -50.11% | 2.21 | +117.19% | 0.25 | -40.00% | +31.06% | +| Llama-3.1-8B-Instruct | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | +| Llama-3.1-8B-Instruct | 2 | 7.51 | -0.42% | 1.89 | +153.97% | 2.07 | -71.01% | +2.76% | +| Llama-3.1-8B-Instruct | 4 | 3.77 | -0.81% | 1.89 | +153.97% | 2.13 | -71.83% | +4.48% | +| Qwen2.5-7B-Instruct | 1 | 14.25 | -0.45% | 2.21 | +153.39% | 0.24 | -37.50% | -4.21% | +| Qwen2.5-7B-Instruct | 1 | 14.25 | -0.45% | 2.21 | +153.39% | 0.24 | -37.50% | -4.21% | +| Qwen2.5-7B-Instruct | 2 | 7.12 | -0.38% | 2.21 | +153.39% | 2.06 | -70.87% | +2.61% | +| Qwen2.5-7B-Instruct | 4 | 3.55 | -0.10% | 2.21 | +153.39% | 2.13 | -71.83% | +4.62% | **Conclusions**: @@ -170,11 +170,11 @@ 
| PP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err | |:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:| | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | -| 1 | 14.99 | -75.05% | 1.89 | +153.97% | 0.25 | -40.00% | +22.03% | -| 1 | 29.98 | -75.05% | 2.21 | +117.19% | 0.25 | +140.00% | +53.09% | | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | -| 2 | 7.51 | +263.59% | 1.10 | +400.00% | 0.07 | +114.29% | -35.31% | -| 4 | 4.26 | +949.87% | 1.05 | +138.10% | 0.07 | +114.29% | -58.99% | +| 1 | 29.98 | -50.11% | 2.21 | +117.19% | 0.25 | -40.00% | +31.06% | +| 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | +| 2 | 7.51 | -0.42% | 1.10 | +336.36% | 0.07 | +114.29% | -0.85% | +| 4 | 4.26 | -12.22% | 1.05 | +357.14% | 0.07 | +114.29% | +1.59% | **Conclusions**: @@ -189,11 +189,11 @@ | bfloat16 | None | auto | 14.99 | -0.22% | 58.11 | -3.47% | | bfloat16 | None | auto | 14.99 | -0.22% | 58.11 | -3.47% | | bfloat16 | None | fp8 | 14.99 | -0.22% | 58.11 | -3.47% | -| bfloat16 | fp8 | auto | 8.49 | -11.91% | 64.61 | +2.11% | -| float16 | None | auto | 14.99 | -75.05% | 58.11 | +22.03% | -| float16 | compressed-tensors | auto | 8.49 | +501.85% | 64.60 | -70.20% | -| float16 | gptq_marlin | auto | 5.38 | -9.49% | 67.71 | -3.29% | -| float32 | None | auto | 29.98 | -75.05% | 42.80 | +53.09% | +| bfloat16 | fp8 | auto | 8.49 | +76.18% | 64.61 | -13.18% | +| float16 | None | auto | 14.99 | -0.22% | 58.11 | -3.47% | +| float16 | compressed-tensors | auto | 8.49 | -0.35% | 64.60 | -3.11% | +| float16 | gptq_marlin | auto | 5.38 | -0.71% | 67.71 | -2.96% | +| float32 | None | auto | 29.98 | -50.11% | 42.80 | +31.06% | **Conclusions**: @@ -208,14 +208,14 @@ |-------|--------------|----|:-------------------:|:----------:|:---------------:|:------:| | 
Llama-3.3-70B-Instruct-fp8-dyn | compressed-tensors | 2 | 33.88 | -0.12% | 37.28 | +5.04% | | Llama-3.3-70B-Instruct-fp8-dyn | compressed-tensors | 4 | 16.96 | -0.24% | 54.09 | +5.90% | -| Meta-Llama-3.1-8B-Instruct-qua | compressed-tensors | 1 | 8.49 | +501.85% | 64.60 | -70.20% | -| Mistral-Small-3.1-24B-Instruct | compressed-tensors | 1 | 24.07 | -28.48% | 48.73 | +9.04% | -| Mistral-Small-3.1-24B-Instruct | compressed-tensors | 2 | 12.11 | +87.45% | 59.02 | -19.27% | +| Meta-Llama-3.1-8B-Instruct-qua | compressed-tensors | 1 | 8.49 | -0.35% | 64.60 | -3.11% | +| Mistral-Small-3.1-24B-Instruct | compressed-tensors | 1 | 24.07 | -0.18% | 48.73 | +1.22% | +| Mistral-Small-3.1-24B-Instruct | compressed-tensors | 2 | 12.11 | -0.79% | 59.02 | +5.28% | | Qwen2.5-7B-Instruct-fp8-dynami | compressed-tensors | 1 | 8.14 | -0.36% | 64.64 | -3.87% | -| Qwen2.5-7B-Instruct-quantized. | compressed-tensors | 1 | 8.14 | -1.60% | 64.64 | -3.56% | -| Llama-3.3-70B-Instruct-quantiz | compressed-tensors | 2 | 33.88 | +49.69% | 37.28 | -47.33% | -| Llama-3.1-8B-Instruct | fp8 | 1 | 8.49 | -11.91% | 64.61 | +2.11% | -| Meta-Llama-3.1-8B-Instruct-qua | gptq_marlin | 1 | 5.38 | -9.49% | 67.71 | -3.29% | +| Qwen2.5-7B-Instruct-quantized. 
| compressed-tensors | 1 | 8.14 | -0.36% | 64.64 | -3.87% | +| Llama-3.3-70B-Instruct-quantiz | compressed-tensors | 2 | 33.88 | -0.12% | 37.28 | +5.04% | +| Llama-3.1-8B-Instruct | fp8 | 1 | 8.49 | +76.18% | 64.61 | -13.18% | +| Meta-Llama-3.1-8B-Instruct-qua | gptq_marlin | 1 | 5.38 | -0.71% | 67.71 | -2.96% | **Conclusions**: @@ -227,11 +227,11 @@ | Model | kv_cache_dtype | Actual KV (GiB) | KV err | Actual tokens | Pred tokens | Token err | Conc err | |-------|:--------------:|:---------------:|:------:|:-------------:|:-----------:|:---------:|:--------:| -| Qwen2.5-7B-Instruct | auto | 58.53 | +11.92% | 1,096,000 | 2,453,196 | +123.83% | +123.83% | -| Qwen2.5-7B-Instruct | fp8 | 58.53 | -4.21% | 2,192,000 | 1,049,789 | -52.11% | -4.22% | +| Qwen2.5-7B-Instruct | auto | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | +| Qwen2.5-7B-Instruct | fp8 | 58.53 | -4.21% | 2,192,000 | 1,049,789 | -52.11% | -52.11% | ||||||||| | Llama-3.1-8B-Instruct | auto | 58.11 | -3.47% | 476,000 | 459,509 | -3.46% | -3.48% | -| Llama-3.1-8B-Instruct | fp8 | 58.11 | -3.47% | 952,032 | 459,509 | -51.73% | -3.47% | +| Llama-3.1-8B-Instruct | fp8 | 58.11 | -3.47% | 952,032 | 459,509 | -51.73% | -51.73% | ||||||||| **Conclusion**: `--kv-cache-dtype fp8` stores each KV element in 1 byte instead of 2 bytes (BF16/FP16), doubling the number of tokens that fit in the KV pool. The KV pool size in GiB is unaffected (same activation and weight overhead), so the **KV GiB error stays near −4%** (the same as the default-dtype baseline). But because the planner always computes per-token bytes from the model's native compute dtype, **token count and max-concurrency predictions are ~52% too low** for fp8-KV runs. This is a direct, fixable bug: the planner should accept `kv_cache_dtype` as an input parameter and apply 1 byte/token when it is `fp8`. 
@@ -245,18 +245,18 @@ The planner uses **fixed constants per architecture** (e.g., 4.8 GiB for Llama, | Architecture | Predicted (GiB) | Observed range (GiB) | Error range | |-------------|:---------------:|:--------------------:|:-----------:| | DeepseekV2 | 8.00 | 1.93–1.93 | +314.51% to +314.51% | -| Gemma2 | 5.50 | 1.89–2.18 | +152.29% to +191.01% | -| Gemma3* | 5.50 | 1.89–2.21 | +148.87% to +191.01% | -| Granite | 5.50 | 3.62–3.66 | +50.27% to +51.93% | -| KimiVL* | 8.00 | 1.52–1.89 | +323.28% to +426.32% | -| Llama | 4.80 | 0.75–3.99 | +20.30% to +540.00% | -| LlavaNext* | 2.50 | 3.94–3.94 | -36.55% to -36.55% | -| Mistral3* | 2.50 | 1.05–2.21 | +13.12% to +138.10% | -| Mixtral | 8.00 | 1.89–1.89 | +323.28% to +323.28% | -| Phi3 | 5.50 | 1.10–1.10 | +400.00% to +400.00% | -| Qwen2 | 5.60 | 1.21–2.92 | +91.78% to +362.81% | +| Gemma2 | 5.50 | 3.62–3.66 | +50.27% to +51.93% | +| Gemma3* | 5.50 | 3.89–3.99 | +37.84% to +41.39% | +| Granite | 5.50 | 0.75–0.85 | +547.06% to +633.33% | +| KimiVL* | 8.00 | 2.85–2.92 | +173.97% to +180.70% | +| Llama | 4.80 | 0.77–2.21 | +117.19% to +523.38% | +| LlavaNext* | 2.50 | 0.79–0.79 | +216.46% to +216.46% | +| Mistral3* | 2.50 | 2.03–2.18 | +14.68% to +23.15% | +| Mixtral | 8.00 | 1.21–1.21 | +561.16% to +561.16% | +| Phi3 | 5.50 | 1.52–1.52 | +261.84% to +261.84% | +| Qwen2 | 5.60 | 2.21–2.29 | +144.54% to +153.39% | | Qwen3 | 5.60 | 2.21–2.21 | +153.39% to +153.39% | -| Qwen3Moe | 8.00 | 2.21–2.21 | +261.99% to +261.99% | +| Qwen3Moe | 8.00 | 2.68–2.68 | +198.51% to +198.51% | The discrepancy suggests the constants were measured with an older vLLM version or different compilation settings. Re-calibrating to these v0.19.0 measurements would be the highest-value fix. 
@@ -264,11 +264,11 @@ The discrepancy suggests the constants were measured with an older vLLM version | TP | PP | Constant used | Actual mean (GiB) | Mean error | |:--:|:--:|:-------------:|:-----------------:|:----------:| -| 1 | 1 | 0.15 GiB | 0.27 | -9.00% | +| 1 | 1 | 0.15 GiB | 0.27 | -41.88% | | 1 | 2 | 0.15 GiB | 0.07 | +114.29% | | 1 | 4 | 0.15 GiB | 0.07 | +114.29% | -| 2 | 1 | 0.6 GiB | 2.08 | -85.58% | -| 4 | 1 | 0.6 GiB | 2.17 | -77.43% | +| 2 | 1 | 0.6 GiB | 2.08 | -71.17% | +| 4 | 1 | 0.6 GiB | 2.17 | -72.34% | For TP=1 the formula slightly under-estimates (0.15 vs ~0.25 GiB actual). For TP≥2, NCCL all-reduce buffers push actual non-torch to ~2.1 GiB — 3.5× the 0.60 GiB constant. For PP≥2, P2P send/receive adds overhead that the formula doesn't model at all. diff --git a/accuracy/scripts/analyze.py b/accuracy/scripts/analyze.py index ffd8d12a..377fed6c 100644 --- a/accuracy/scripts/analyze.py +++ b/accuracy/scripts/analyze.py @@ -65,8 +65,26 @@ def fv(v: float, d: int = 2) -> str: raw_ok = [r for r in csv.DictReader(RAW_CSV.open()) if r["status"] == "ok"] pred_all = list(csv.DictReader(PRED_CSV.open())) -assert len(raw_ok) == len(pred_all) -pairs = list(zip(raw_ok, pred_all)) + +def _row_key(r: dict) -> tuple: + return (r["model"], r["tp"], r["pp"], r["dp"], r["max_model_len"], + r["dtype"], r.get("quantization", ""), r.get("kv_cache_dtype", "auto")) + +pred_map: dict[tuple, list] = {} +for p in pred_all: + pred_map.setdefault(_row_key(p), []).append(p) + +# Consume predictions in order; each raw row pops one matching prediction. 
+pairs: list[tuple] = [] +_counts: dict[tuple, int] = {} +for raw in raw_ok: + k = _row_key(raw) + bucket = pred_map.get(k, []) + idx = _counts.get(k, 0) + if idx < len(bucket): + pairs.append((raw, bucket[idx])) + _counts[k] = idx + 1 + # rows with no matching prediction are silently skipped # ── Per-row error calculation ───────────────────────────────────────────────── diff --git a/accuracy/scripts/sweep-gemma.yaml b/accuracy/scripts/sweep-gemma.yaml new file mode 100644 index 00000000..2d2df8d9 --- /dev/null +++ b/accuracy/scripts/sweep-gemma.yaml @@ -0,0 +1,35 @@ +defaults: + gpu: H100-80GB + gpu_memory_utilization: "0.95" + max_model_len: 8192 + pp: 1 + dp: 1 + dtype: auto + kv_cache_dtype: auto + quantization: null + vllm_image: vllm/vllm-openai:v0.19.0 + namespace: llmdplanner + results_pvc: vllm-mem-data + hf_token_secret: hf-token-gemma + node_selector: + nvidia.com/gpu.product: NVIDIA-H100-80GB-HBM3 + kubernetes.io/hostname: pokprod-b93r38s0 # only node with confirmed internet egress + +runs: + - model: google/gemma-2-2b-it # 2.6B dense; Gemma2 architecture + tp: 1 + + - model: google/gemma-2-9b-it # 9.2B dense; Gemma2 architecture + tp: 1 + + - model: google/gemma-2-27b-it # 27.2B dense; ~54 GiB bf16 + tp: 1 + + - model: google/gemma-3-4b-it # 4B dense; Gemma3 architecture + tp: 1 + + - model: google/gemma-3-12b-it # 12B dense; Gemma3 architecture + tp: 1 + + - model: google/gemma-3-27b-it # 27B dense; Gemma3 architecture + tp: 1 diff --git a/accuracy/scripts/sweep_runner.py b/accuracy/scripts/sweep_runner.py index 9edab422..2f9ce3f9 100644 --- a/accuracy/scripts/sweep_runner.py +++ b/accuracy/scripts/sweep_runner.py @@ -120,7 +120,7 @@ def _build_job_manifest(run_id: str, run: dict[str, Any]) -> dict[str, Any]: ], "env": [ {"name": "HF_TOKEN", "valueFrom": - {"secretKeyRef": {"name": "hf-token", "key": "token"}}}, + {"secretKeyRef": {"name": run.get("hf_token_secret", "hf-token"), "key": "token"}}}, {"name": "HF_HOME", "value": "/data/models"}, 
{"name": "HOME", "value": "/data"}, {"name": "XDG_CACHE_HOME", "value": "/data/.cache"}, From fbd68515a903397a1b785a2189dc7512b0d55680 Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Wed, 22 Apr 2026 15:24:01 -0400 Subject: [PATCH 07/24] Clean up report Signed-off-by: Jing Chen --- accuracy/results/v0.19.0/accuracy_report.md | 329 ++++------ accuracy/scripts/analyze.py | 633 ++++++++++---------- accuracy/scripts/sweep.yaml | 29 + 3 files changed, 447 insertions(+), 544 deletions(-) diff --git a/accuracy/results/v0.19.0/accuracy_report.md b/accuracy/results/v0.19.0/accuracy_report.md index d4165ec5..3242079d 100644 --- a/accuracy/results/v0.19.0/accuracy_report.md +++ b/accuracy/results/v0.19.0/accuracy_report.md @@ -1,102 +1,37 @@ # Capacity Planner Accuracy Report — vLLM v0.19.0 / H100-80GB -**Dataset**: 54 successful runs across 29 unique models -**Hardware**: H100-80GB (catalog memory = 80 GiB, actual = ~79.19 GiB) -**Planner GPU util**: actual `gpu_memory_utilization` per run (0.95) +**Hardware**: H100-80GB (catalog 80 GiB, physical 79.19 GiB) +**Planner inputs evaluated**: model, tp, pp, dp, max_model_len, gpu_memory_utilization -## Executive Summary +> Percent error = (predicted − actual) / actual × 100. Positive = over-estimate, negative = under-estimate. -| Metric | Mean error | Mean abs error | Notes | -|--------|:----------:|:--------------:|-------| -| **KV Cache memory** (all 47 runs) | +0.45% | +7.16% | | -| **KV Cache memory** (baseline: tp=pp=1, len=8192, no-quant) | -3.51% | — | n=23 | -| **Weight memory** | -0.22% | +3.04% | From safetensors metadata | -| **Activation memory** | +185.38% | +185.38% | Largest error source | -| **Non-torch overhead** | -43.23% | +51.70% | | -| **Max concurrency** | -1.34% | +14.34% | Proxy for KV cache accuracy | +## Part 1: Accuracy Evaluation -### Key Findings +Covers 50 runs across 29 models using only parameters the planner currently accepts as inputs. 
Excludes runs with `--dtype float32`, runtime `--quantization fp8`, and `--kv-cache-dtype fp8` (see Part 2). -1. **Weights are accurate** — mean abs error +3.04%, computed directly from safetensors parameter counts. Errors arise only when `--dtype` overrides the native dtype (e.g., `--dtype float32`) or when quantization is not fully captured in the config. -2. **Activation is the dominant error source** — mean +185.38% (over-estimate). The planner uses empirical constants (4.8–8.0 GiB) measured at `max_model_len=16000`; vLLM v0.19.0 reports 0.75–2.2 GiB across all architectures tested. Granite is worst (+600%), Mistral3/Pixtral is best (+15–23%). -3. **Over-estimated activation partially cancels** the catalog GPU memory inflation (+0.77 GiB), leaving KV cache only +0.45% off on average across all runs. But this is coincidental cancellation of two large opposing errors, not model accuracy. -4. **Non-default KV dtype (`--kv-cache-dtype fp8`) doubles token capacity** but the planner ignores this flag — KV token count is off by ~2× for those runs. -5. **`--dtype float32` breaks weight prediction** — the planner uses the HuggingFace config dtype (BF16) and never sees the vLLM `--dtype` override, giving −50% weight error. -6. **Pipeline parallelism reduces actual activation** (each GPU processes fewer layers) but the formula uses the same constant regardless of PP, compounding the activation error. +### Summary -## Component-Level Error Breakdown +| Metric | Mean error | Mean abs error | n | +|--------|:----------:|:--------------:|:-:| +| KV cache memory (all runs) | +0.28% | +6.70% | 50 | +| KV cache memory (baseline: tp=pp=dp=1, len=8192, no quant) | -5.29% | — | 19 | +| Weight memory | -0.75% | +0.75% | 50 | +| Activation memory | +188.64% | +188.64% | 50 | +| Non-torch overhead | -43.54% | — | 50 | +| Max concurrency | -2.35% | +9.90% | 50 | -> Percent error = (predicted − actual) / actual × 100. Positive = over-estimate, negative = under-estimate. 
+**Key findings**: +- **Weight memory is accurate**: mean abs error +0.75%, computed directly from safetensors parameter counts. +- **KV cache memory is close**: +0.28% mean error across all runs; -5.29% at baseline. Errors are small and consistent. +- **Activation is the dominant error source**: mean +188.64% (over-estimate). The planner uses empirical constants measured against an older vLLM version; v0.19.0 reports substantially lower values. See Root Cause Analysis. +- **Max concurrency tracks KV accuracy**: -2.35% mean error; deviations come from the per-token KV formula, not the pool size prediction. -### All 47 Runs (n=54) - -| Component | Mean error | Median | Mean abs | Min | Max | n | -|-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| -| Weight | -0.22% | -0.33% | +3.04% | -50.11% | +76.18% | 54 | -| Activation | +185.38% | +153.39% | +185.38% | +14.68% | +633.33% | 54 | -| Non Torch | -43.23% | -40.00% | +51.70% | -72.85% | +114.29% | 54 | -| Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 54 | -| Total Non Kv | +18.33% | +16.28% | +20.02% | -38.61% | +97.49% | 54 | -| Kv Cache | +0.45% | -3.47% | +7.16% | -28.75% | +61.82% | 54 | -| Max Concurrency | -1.34% | -3.47% | +14.34% | -87.31% | +162.10% | 54 | - -### Baseline: TP=1, PP=1, len=8192, no quantization, default KV dtype (n=23) - -| Component | Mean error | Median | Mean abs | Min | Max | n | -|-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| -| Weight | -2.82% | -0.22% | +2.82% | -50.11% | +0.04% | 23 | -| Activation | +206.53% | +153.97% | +206.53% | +23.15% | +633.33% | 23 | -| Non Torch | -43.74% | -40.00% | +43.74% | -67.39% | -37.50% | 23 | -| Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 23 | -| Total Non Kv | +15.04% | +16.28% | +18.40% | -38.61% | +74.27% | 23 | -| Kv Cache | -3.51% | -4.21% | +6.34% | -28.75% | +31.06% | 23 | -| Max Concurrency | -0.47% | -4.22% | +15.94% | -87.31% | +162.10% | 23 | - -### Multi-GPU (TP > 
1 or PP > 1) (n=15) - -| Component | Mean error | Median | Mean abs | Min | Max | n | -|-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| -| Weight | -1.20% | -0.38% | +1.20% | -12.22% | -0.03% | 15 | -| Activation | +196.02% | +153.39% | +196.02% | +23.15% | +561.16% | 15 | -| Non Torch | -46.75% | -71.29% | +77.23% | -72.85% | +114.29% | 15 | -| Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 15 | -| Total Non Kv | +16.86% | +11.34% | +17.76% | -6.76% | +97.49% | 15 | -| Kv Cache | +11.26% | +4.62% | +11.63% | -1.88% | +61.82% | 15 | -| Max Concurrency | +6.58% | +4.63% | +16.32% | -71.19% | +62.24% | 15 | - -### Quantized Models (fp8-dynamic / w8a8 / w4a16) (n=10) - -| Component | Mean error | Median | Mean abs | Min | Max | n | -|-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| -| Weight | +7.29% | -0.29% | +7.94% | -0.79% | +76.18% | 10 | -| Activation | +124.00% | +149.15% | +124.00% | +14.68% | +153.97% | 10 | -| Non Torch | -52.67% | -41.15% | +52.67% | -72.85% | -37.50% | 10 | -| Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 10 | -| Total Non Kv | +21.87% | +15.87% | +23.22% | -6.76% | +87.45% | 10 | -| Kv Cache | -0.45% | -0.87% | +4.95% | -13.18% | +5.90% | 10 | -| Max Concurrency | -0.44% | -0.86% | +4.95% | -13.19% | +5.89% | 10 | - -### Non-default KV cache dtype (--kv-cache-dtype fp8) (n=2) - -| Component | Mean error | Median | Mean abs | Min | Max | n | -|-----------|:----------:|:------:|:--------:|:---:|:---:|:-:| -| Weight | -0.34% | -0.34% | +0.34% | -0.45% | -0.22% | 2 | -| Activation | +153.68% | +153.68% | +153.68% | +153.39% | +153.97% | 2 | -| Non Torch | -38.75% | -38.75% | +38.75% | -40.00% | -37.50% | 2 | -| Cuda Graph | -100.00% | -100.00% | +100.00% | -100.00% | -100.00% | 2 | -| Total Non Kv | +17.83% | +17.83% | +17.83% | +16.28% | +19.37% | 2 | -| Kv Cache | -3.84% | -3.84% | +3.84% | -4.21% | -3.47% | 2 | -| Max Concurrency | -51.92% | -51.92% | +51.92% | -52.11% | 
-51.73% | 2 | - -## Per-Model Errors — Baseline Runs - -> TP=1, PP=1, max_model_len=8192, no quantization, default KV dtype. +### Per-Model Results — Baseline (TP=1, PP=1, DP=1, len=8192, no quantization) | Model | Arch | Weight err | Activation err | Non-torch err | KV cache err | Max conc err | |-------|------|:----------:|:--------------:|:-------------:|:------------:|:------------:| | Qwen2.5-7B-Instruct | Qwen2 | -0.45% | +153.39% | -37.50% | -4.21% | -4.22% | -| Qwen2.5-7B-Instruct | Qwen2 | -0.45% | +153.39% | -37.50% | -4.21% | -4.22% | | Qwen3-30B-A3B | Qwen3Moe | -0.02% | +198.51% | -44.44% | -28.75% | -28.72% | | Qwen3-8B | Qwen3 | -0.09% | +153.39% | -40.00% | -4.36% | -4.36% | | CodeLlama-7b-hf | Llama | -0.07% | +523.38% | -40.00% | -5.13% | -5.13% | @@ -112,144 +47,70 @@ | granite-3.3-8b-instruct | Granite | -0.20% | +547.06% | -67.39% | -6.02% | -6.03% | | granite-vision-3.3-2b | LlavaNext* | +0.04% | +216.46% | -40.00% | -1.23% | -1.23% | | Llama-3.1-8B-Instruct | Llama | -0.22% | +153.97% | -40.00% | -3.47% | -3.48% | -| Llama-3.1-8B-Instruct | Llama | -0.22% | +153.97% | -40.00% | -3.47% | -3.48% | -| Llama-3.1-8B-Instruct | Llama | -50.11% | +117.19% | -40.00% | +31.06% | +162.10% | -| Llama-3.1-8B-Instruct | Llama | -0.22% | +153.97% | -40.00% | -3.47% | -3.48% | | phi-4 | Phi3 | -0.31% | +261.84% | -40.00% | -6.59% | -6.58% | | Mistral-Small-3.1-24B-Instruct-2503 | Mistral3* | -0.08% | +23.15% | -40.00% | +1.54% | +1.55% | | Kimi-VL-A3B-Instruct | KimiVL* | -0.58% | +173.97% | -40.00% | -9.76% | -87.31% | -## Argument Sensitivity Analysis - -> This section examines how each vLLM launch argument affects whether the capacity planner's memory predictions remain accurate. 
- -### `--max-model-len` (context window size) - -| Model | max_model_len | Actual KV (GiB) | KV err | Actual tokens | Pred tokens | Token err | Max conc err | -|-------|:-------------:|:---------------:|:------:|:-------------:|:-----------:|:---------:|:------------:| -| Llama-3.1-8B-Instruct | 2,048 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.47% | -| Llama-3.1-8B-Instruct | 4,096 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.47% | -| Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,000 | 459,509 | -3.46% | -3.48% | -| Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.48% | -| Llama-3.1-8B-Instruct | 8,192 | 42.80 | +31.06% | 175,296 | 459,509 | +162.13% | +162.10% | -| Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.48% | -| Llama-3.1-8B-Instruct | 16,384 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.44% | -| Llama-3.1-8B-Instruct | 32,768 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -3.51% | -| Qwen2.5-7B-Instruct | 2,048 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | -| Qwen2.5-7B-Instruct | 4,096 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | -| Qwen2.5-7B-Instruct | 8,192 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | -| Qwen2.5-7B-Instruct | 8,192 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | -| Qwen2.5-7B-Instruct | 16,384 | 58.53 | -4.21% | 1,095,968 | 1,049,789 | -4.21% | -4.22% | -| Qwen2.5-7B-Instruct | 32,768 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | - -**Conclusion**: `--max-model-len` has **no effect on KV pool size** — the formula and vLLM agree on this. Activation memory is constant (the fixed profiling overhead does not depend on context length), so the KV pool prediction error stays flat at ~−3 to −4% regardless of whether context is 2 K or 32 K tokens. The token/concurrency predictions carry that same constant KV error forward, plus any error from the per-token KV formula. 
- -### `--tensor-parallel-size` (TP) +### Sensitivity: Tensor Parallelism (TP) | Model | TP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err | |-------|:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:| | Llama-3.1-8B-Instruct | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | -| Llama-3.1-8B-Instruct | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | -| Llama-3.1-8B-Instruct | 1 | 29.98 | -50.11% | 2.21 | +117.19% | 0.25 | -40.00% | +31.06% | -| Llama-3.1-8B-Instruct | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | | Llama-3.1-8B-Instruct | 2 | 7.51 | -0.42% | 1.89 | +153.97% | 2.07 | -71.01% | +2.76% | | Llama-3.1-8B-Instruct | 4 | 3.77 | -0.81% | 1.89 | +153.97% | 2.13 | -71.83% | +4.48% | | Qwen2.5-7B-Instruct | 1 | 14.25 | -0.45% | 2.21 | +153.39% | 0.24 | -37.50% | -4.21% | -| Qwen2.5-7B-Instruct | 1 | 14.25 | -0.45% | 2.21 | +153.39% | 0.24 | -37.50% | -4.21% | | Qwen2.5-7B-Instruct | 2 | 7.12 | -0.38% | 2.21 | +153.39% | 2.06 | -70.87% | +2.61% | | Qwen2.5-7B-Instruct | 4 | 3.55 | -0.10% | 2.21 | +153.39% | 2.13 | -71.83% | +4.62% | -**Conclusions**: +- **Weights scale correctly** with TP: error stays near 0% across TP=1–4. +- **Activation is TP-invariant** in both formula and vLLM: error stays flat. +- **Non-torch is under-estimated at TP≥2**: NCCL all-reduce buffers push actual to ~2.1 GiB/GPU but the constant is 0.60 GiB. The opposing over-estimate in activation partially masks this in the KV cache error. -- **Weights scale correctly**: the formula divides by TP, matching vLLM's per-GPU weight sharding. Weight error stays near 0% across TP=1–4. -- **Activation is TP-invariant in both formula and reality**: vLLM's profiling overhead does not shrink with TP (it captures the same set of batch sizes). 
The formula also keeps activation constant with TP. Error stays flat. -- **Non-torch is heavily under-estimated for TP≥2**: the 0.60 GiB/GPU constant does not capture NCCL all-reduce buffer overhead, which grows with TP. Actual non-torch reaches ~2.1 GiB/GPU at TP=4 (3.5× the constant). However, this error is partially masked in KV cache accuracy because the over-estimated activation pulls the prediction in the opposite direction. +### Sensitivity: Pipeline Parallelism (PP) -### `--pipeline-parallel-size` (PP) +Model: meta-llama/Llama-3.1-8B-Instruct, TP=1, len=8192 | PP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err | |:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:| | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | -| 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | -| 1 | 29.98 | -50.11% | 2.21 | +117.19% | 0.25 | -40.00% | +31.06% | -| 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | | 2 | 7.51 | -0.42% | 1.10 | +336.36% | 0.07 | +114.29% | -0.85% | | 4 | 4.26 | -12.22% | 1.05 | +357.14% | 0.07 | +114.29% | +1.59% | -**Conclusions**: - -- **Activation drops sharply with PP**: at PP=1, vLLM profiles 1.89 GiB of activation; at PP=2 it drops to 1.10 GiB; at PP=4 to 1.05 GiB. Each pipeline stage runs fewer transformer layers, so the profiling sweep allocates proportionally less. The formula does not account for this and always predicts 4.80 GiB, making the activation error grow with PP (from ~+154% at PP=1 to ~+357% at PP=4). -- **Non-torch increases with PP** due to inter-stage P2P send/receive buffers, but the formula uses the same TP=1 constant (0.15 GiB/GPU) regardless of PP, causing the non-torch estimate to overshoot actual (predicted > actual for PP>1 because each stage is a separate process and 0.15 is per-GPU). 
These two errors partially offset each other in the KV cache prediction. -- **Weight error grows with PP**: the formula divides only by TP×PP for weight sharding, but with PP=4, model layers are not uniformly distributed across stages in all cases (irregular last-stage allocation can leave a stage with fewer params). - -### `--dtype` (compute/storage dtype override) +- **Activation drops with PP**: PP=1 → 1.89 GiB, PP=2 → 1.10 GiB, PP=4 → 1.05 GiB. The formula always predicts 4.80 GiB regardless of PP. +- **Weight error grows with PP**: layer imbalance across stages causes the formula (which assumes uniform distribution) to deviate at high PP. -| dtype arg | quantization | kv_cache_dtype | Actual weight (GiB) | Weight err | Actual KV (GiB) | KV err | -|-----------|:------------:|:--------------:|:-------------------:|:----------:|:---------------:|:------:| -| bfloat16 | None | auto | 14.99 | -0.22% | 58.11 | -3.47% | -| bfloat16 | None | auto | 14.99 | -0.22% | 58.11 | -3.47% | -| bfloat16 | None | fp8 | 14.99 | -0.22% | 58.11 | -3.47% | -| bfloat16 | fp8 | auto | 8.49 | +76.18% | 64.61 | -13.18% | -| float16 | None | auto | 14.99 | -0.22% | 58.11 | -3.47% | -| float16 | compressed-tensors | auto | 8.49 | -0.35% | 64.60 | -3.11% | -| float16 | gptq_marlin | auto | 5.38 | -0.71% | 67.71 | -2.96% | -| float32 | None | auto | 29.98 | -50.11% | 42.80 | +31.06% | +### Sensitivity: Context Length (max_model_len) -**Conclusions**: +| Model | max_model_len | Actual KV (GiB) | KV err | Actual tokens | Pred tokens | Token err | +|-------|:-------------:|:---------------:|:------:|:-------------:|:-----------:|:---------:| +| Llama-3.1-8B-Instruct | 2,048 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | +| Llama-3.1-8B-Instruct | 4,096 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | +| Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,000 | 459,509 | -3.46% | +| Llama-3.1-8B-Instruct | 16,384 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | +| Llama-3.1-8B-Instruct | 32,768 
| 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | +| Qwen2.5-7B-Instruct | 2,048 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | +| Qwen2.5-7B-Instruct | 4,096 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | +| Qwen2.5-7B-Instruct | 8,192 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | +| Qwen2.5-7B-Instruct | 16,384 | 58.53 | -4.21% | 1,095,968 | 1,049,789 | -4.21% | +| Qwen2.5-7B-Instruct | 32,768 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -- **`--dtype float32`** doubles model weight memory (29.98 GiB vs BF16's 14.99 GiB). The planner reads the HuggingFace config dtype (BF16) and is unaware of the `--dtype` vLLM override, so it predicts 14.96 GiB — a **−50% weight error**, which cascades into a +31% KV cache over-prediction (the planner thinks there is more room than there is). -- **`--dtype float16`** is handled correctly because the HuggingFace config also stores float16 for these models; weight error stays near 0%. -- **FP8-dynamic quantization** (`fp8` in the quantization column) halves weight memory. The planner reads `quantization_config` from the HuggingFace repo and applies the FP8 byte-per-param, yielding near-zero weight error. KV cache error stays consistent with the activation over-estimation. -- **`--kv-cache-dtype fp8`** does not affect weight or activation predictions, but halves per-token KV storage. The planner ignores this flag and predicts KV tokens ~50% too low (see dedicated section below). +- **KV pool size (GiB) is independent of max_model_len**: both formula and vLLM agree. The pool is sized from available memory, not from a pre-allocated token count. +- **Token count predictions vary**: the per-token KV bytes formula has model-dependent errors that show up consistently across all context lengths. 
-### `--quantization` (weight quantization method) +### Root Cause Analysis -| Model | quant method | TP | Actual weight (GiB) | Weight err | Actual KV (GiB) | KV err | -|-------|--------------|----|:-------------------:|:----------:|:---------------:|:------:| -| Llama-3.3-70B-Instruct-fp8-dyn | compressed-tensors | 2 | 33.88 | -0.12% | 37.28 | +5.04% | -| Llama-3.3-70B-Instruct-fp8-dyn | compressed-tensors | 4 | 16.96 | -0.24% | 54.09 | +5.90% | -| Meta-Llama-3.1-8B-Instruct-qua | compressed-tensors | 1 | 8.49 | -0.35% | 64.60 | -3.11% | -| Mistral-Small-3.1-24B-Instruct | compressed-tensors | 1 | 24.07 | -0.18% | 48.73 | +1.22% | -| Mistral-Small-3.1-24B-Instruct | compressed-tensors | 2 | 12.11 | -0.79% | 59.02 | +5.28% | -| Qwen2.5-7B-Instruct-fp8-dynami | compressed-tensors | 1 | 8.14 | -0.36% | 64.64 | -3.87% | -| Qwen2.5-7B-Instruct-quantized. | compressed-tensors | 1 | 8.14 | -0.36% | 64.64 | -3.87% | -| Llama-3.3-70B-Instruct-quantiz | compressed-tensors | 2 | 33.88 | -0.12% | 37.28 | +5.04% | -| Llama-3.1-8B-Instruct | fp8 | 1 | 8.49 | +76.18% | 64.61 | -13.18% | -| Meta-Llama-3.1-8B-Instruct-qua | gptq_marlin | 1 | 5.38 | -0.71% | 67.71 | -2.96% | +#### 1. Activation Constants Are Stale -**Conclusions**: +The planner uses fixed constants per architecture (e.g., 4.8 GiB for Llama) calibrated against an older vLLM version. vLLM v0.19.0 reports substantially lower values: -- **w8a8 (compressed-tensors INT8)**: the planner parses `config_groups` from the `quantization_config` to find `num_bits=8` and applies 1 byte/param. Weight errors are near zero (−0.3 to −0.7%), indicating the INT8 parameter count is well-captured. -- **w4a16 (GPTQ-marlin INT4)**: the planner parses `num_bits=4` from the quantization config and applies 0.5 bytes/param. Weight error is small (~−0.7%). The large reduction in weights (5.3 GiB vs 15 GiB for BF16) frees more KV cache, and the planner correctly tracks this effect — KV error stays in the −3% range. 
-- **fp8-dynamic** (fp8 per-tensor dynamic quant via `compressed-tensors`): the planner extracts fp8 precision from the quantization config. Weight error is near zero. Unexpectedly, weight error for the RedHat fp8 70B model at TP=2 stays very low, confirming the quant config parsing is correct for this variant. - -### `--kv-cache-dtype` (KV cache precision) - -| Model | kv_cache_dtype | Actual KV (GiB) | KV err | Actual tokens | Pred tokens | Token err | Conc err | -|-------|:--------------:|:---------------:|:------:|:-------------:|:-----------:|:---------:|:--------:| -| Qwen2.5-7B-Instruct | auto | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -4.22% | -| Qwen2.5-7B-Instruct | fp8 | 58.53 | -4.21% | 2,192,000 | 1,049,789 | -52.11% | -52.11% | -||||||||| -| Llama-3.1-8B-Instruct | auto | 58.11 | -3.47% | 476,000 | 459,509 | -3.46% | -3.48% | -| Llama-3.1-8B-Instruct | fp8 | 58.11 | -3.47% | 952,032 | 459,509 | -51.73% | -51.73% | -||||||||| - -**Conclusion**: `--kv-cache-dtype fp8` stores each KV element in 1 byte instead of 2 bytes (BF16/FP16), doubling the number of tokens that fit in the KV pool. The KV pool size in GiB is unaffected (same activation and weight overhead), so the **KV GiB error stays near −4%** (the same as the default-dtype baseline). But because the planner always computes per-token bytes from the model's native compute dtype, **token count and max-concurrency predictions are ~52% too low** for fp8-KV runs. This is a direct, fixable bug: the planner should accept `kv_cache_dtype` as an input parameter and apply 1 byte/token when it is `fp8`. - -## Root Cause Analysis - -### 1. Activation Memory — Largest Error Source - -The planner uses **fixed constants per architecture** (e.g., 4.8 GiB for Llama, 5.6 GiB for Qwen2/3) empirically measured at `max_model_len=16000`. 
vLLM v0.19.0 reports substantially lower values during its profiling phase: - -| Architecture | Predicted (GiB) | Observed range (GiB) | Error range | -|-------------|:---------------:|:--------------------:|:-----------:| +| Architecture | Planner constant (GiB) | Observed v0.19.0 range (GiB) | Error range | +|-------------|:---------------------:|:----------------------------:|:-----------:| | DeepseekV2 | 8.00 | 1.93–1.93 | +314.51% to +314.51% | | Gemma2 | 5.50 | 3.62–3.66 | +50.27% to +51.93% | | Gemma3* | 5.50 | 3.89–3.99 | +37.84% to +41.39% | | Granite | 5.50 | 0.75–0.85 | +547.06% to +633.33% | | KimiVL* | 8.00 | 2.85–2.92 | +173.97% to +180.70% | -| Llama | 4.80 | 0.77–2.21 | +117.19% to +523.38% | +| Llama | 4.80 | 0.77–1.97 | +143.65% to +523.38% | | LlavaNext* | 2.50 | 0.79–0.79 | +216.46% to +216.46% | | Mistral3* | 2.50 | 2.03–2.18 | +14.68% to +23.15% | | Mixtral | 8.00 | 1.21–1.21 | +561.16% to +561.16% | @@ -258,41 +119,79 @@ The planner uses **fixed constants per architecture** (e.g., 4.8 GiB for Llama, | Qwen3 | 5.60 | 2.21–2.21 | +153.39% to +153.39% | | Qwen3Moe | 8.00 | 2.68–2.68 | +198.51% to +198.51% | -The discrepancy suggests the constants were measured with an older vLLM version or different compilation settings. Re-calibrating to these v0.19.0 measurements would be the highest-value fix. +Re-calibrating these constants from the v0.19.0 measurements is the highest-value fix. + +#### 2. Non-torch Constants Under-estimated for Multi-GPU + +| TP | PP | Constant used (GiB) | Observed mean (GiB) | Mean error | +|:--:|:--:|:-------------------:|:-------------------:|:----------:| +| 1 | 1 | 0.15 | 0.27 | -42.17% | +| 1 | 2 | 0.15 | 0.07 | +114.29% | +| 1 | 4 | 0.15 | 0.07 | +114.29% | +| 2 | 1 | 0.6 | 2.08 | -71.17% | +| 4 | 1 | 0.6 | 2.17 | -72.34% | + +TP≥2 requires NCCL all-reduce buffers (~2.1 GiB/GPU vs the 0.60 GiB constant). PP≥2 adds P2P send/receive buffers that the formula ignores entirely. + +#### 3. 
GPU Catalog vs Physical Memory + +Planner uses 80 GiB (catalog); H100 physical VRAM is 79.19 GiB. +Effect: KV pool over-predicted by ~0.77 GiB (76.00 vs 75.23 GiB at 0.95 utilization). + +#### 4. CUDA Graph Memory + +Observed pool sizes: 0.51–1.85 GiB (mean 1.03 GiB). vLLM allocates CUDA graphs after sizing the KV cache, so the reported KV pool already includes CUDA graph memory — no formula correction needed. + +--- + +## Part 2: Next Steps — Parameters Not Yet Modeled + +The following vLLM flags affect memory allocation but are not yet accepted as planner inputs. Each subsection quantifies the prediction gap to inform which inputs to add next. -### 2. Non-torch Memory — Underestimated for Multi-GPU +### `--kv-cache-dtype fp8` -| TP | PP | Constant used | Actual mean (GiB) | Mean error | -|:--:|:--:|:-------------:|:-----------------:|:----------:| -| 1 | 1 | 0.15 GiB | 0.27 | -41.88% | -| 1 | 2 | 0.15 GiB | 0.07 | +114.29% | -| 1 | 4 | 0.15 GiB | 0.07 | +114.29% | -| 2 | 1 | 0.6 GiB | 2.08 | -71.17% | -| 4 | 1 | 0.6 GiB | 2.17 | -72.34% | +| Model | kv_cache_dtype | Actual KV (GiB) | KV GiB err | Actual tokens | Pred tokens | Token err | +|-------|:--------------:|:---------------:|:----------:|:-------------:|:-----------:|:---------:| +| Qwen2.5-7B-Instruct | auto | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | +| Qwen2.5-7B-Instruct | fp8 | 58.53 | -4.21% | 2,192,000 | 1,049,789 | -52.11% | +|||||||| +| Llama-3.1-8B-Instruct | auto | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | +| Llama-3.1-8B-Instruct | fp8 | 58.11 | -3.47% | 952,032 | 459,509 | -51.73% | +|||||||| -For TP=1 the formula slightly under-estimates (0.15 vs ~0.25 GiB actual). For TP≥2, NCCL all-reduce buffers push actual non-torch to ~2.1 GiB — 3.5× the 0.60 GiB constant. For PP≥2, P2P send/receive adds overhead that the formula doesn't model at all. +**KV pool size (GiB) is unaffected** — fp8 halves per-token storage, not the pool. The planner's GiB prediction stays accurate. 
**Token count is ~2× too low** because the planner always uses the model's native dtype (BF16 = 2 bytes/element) instead of fp8 (1 byte/element). Fix: accept `kv_cache_dtype` as input; when `fp8`, use 1 byte/element. -### 3. GPU Memory Catalog vs Physical +### `--dtype` override -The planner uses 80 GiB (catalog) but H100 physical VRAM is 79.19 GiB: +| dtype | Actual weight (GiB) | Weight err | Actual KV (GiB) | KV err | +|-------|:-------------------:|:----------:|:---------------:|:------:| +| bfloat16 | 14.99 | -0.22% | 58.11 | -3.47% | +| bfloat16 | 14.99 | -0.22% | 58.11 | -3.47% | +| bfloat16 | 14.99 | -0.22% | 58.11 | -3.47% | +| float16 | 14.99 | -0.22% | 58.11 | -3.47% | +| float32 | 29.98 | -50.11% | 42.80 | +31.06% | -- Catalog available: 80 × 0.95 = **76.00 GiB** -- Physical available: 79.19 × 0.95 = **75.23 GiB** -- Systematic KV over-prediction from this source alone: **+0.77 GiB** +**`--dtype float32`** doubles weight memory. The planner reads the HF config dtype (BF16) and has no visibility into the vLLM override → −50% weight error, +31% KV error. +**`--dtype float16`** matches the HF config for these models → near-zero error. +Fix: accept `dtype` as input and use it to override the bytes-per-param calculation. -### 4. CUDA Graph Memory — Excluded from Formula +### Runtime `--quantization fp8` -The planner returns 0.0 GiB for CUDA graphs (treating it as included in activation). vLLM allocates the CUDA graph pool *after* sizing the KV cache, so the reported KV pool includes CUDA graph memory. The formula is therefore consistent with the log-reported KV number — no fix needed, but it should be documented. +| Model | Actual weight (GiB) | Weight err | Actual KV (GiB) | KV err | +|-------|:-------------------:|:----------:|:---------------:|:------:| +| Llama-3.1-8B-Instruct | 8.49 | +76.18% | 64.61 | -13.18% | -Observed CUDA graph pool sizes: 0.51–1.85 GiB (mean 1.02 GiB). +Runtime `--quantization fp8` compresses weights on-the-fly after loading.
vLLM logs the post-compression size (~half of BF16). The planner finds no `quantization_config` in the HF repo and predicts the full BF16 weight → ~+76% weight error. +Fix: accept `quantization fp8` as input; apply 1 byte/param for weight estimation. -## Recommendations +### Recommendations -| Priority | Fix | Expected impact | -|:--------:|-----|:---------------:| -| 🔴 High | **Re-calibrate activation constants** from v0.19.0 measurements. Current constants are 2–7× too high. Updating to ~1.0–2.2 GiB/architecture would remove the single largest prediction error. | +4–10 GiB KV accuracy | -| 🔴 High | **Accept `--kv-cache-dtype` as a planner input.** When set to `fp8`, halve the per-token KV bytes. This is a one-line formula change. | 2× token/concurrency accuracy for fp8-KV runs | -| 🔴 High | **Accept `--dtype` as a planner input.** When set to `float32`, double the per-param bytes for weight estimation. | Fixes −50% weight error for float32 runs | -| 🟡 Medium | **Re-measure non-torch constants for TP≥2 and PP≥2.** NCCL overhead scales with both and is currently under-estimated by ~3.5×. | +1–2 GiB KV accuracy for multi-GPU | -| 🟡 Medium | **Scale activation constant by 1/PP.** Each pipeline stage processes layers/PP transformer blocks; profiling overhead scales proportionally. | Fixes growing activation error at high PP | -| 🟢 Low | **Use physical GPU memory** (79.19 GiB for H100) rather than the catalog 80 GiB nominal. | +0.77 GiB KV accuracy | \ No newline at end of file +| Priority | Input to add | Expected impact | +|:--------:|-------------|:---------------:| +| High | **Re-calibrate activation constants** from v0.19.0 measurements. Current constants are 2–7× too high. | Removes largest single error source | +| High | **`kv_cache_dtype`** — when `fp8`, use 1 byte/element for KV. | Fixes ~2× token/concurrency error for fp8-KV runs | +| Medium | **`dtype`** — when `float32`, double bytes-per-param.
| Fixes −50% weight error for float32 runs | +| Medium | **`quantization fp8` (runtime)** — apply 1 byte/param. | Fixes +76% weight error for runtime-fp8 runs | +| Medium | **Re-measure non-torch constants for TP≥2 and PP≥2.** | +1–2 GiB KV accuracy for multi-GPU | +| Medium | **Scale activation constant by 1/PP.** | Fixes growing activation error at high PP | +| Low | **Use physical GPU memory** (79.19 GiB) instead of catalog 80 GiB. | +0.77 GiB KV accuracy | \ No newline at end of file diff --git a/accuracy/scripts/analyze.py b/accuracy/scripts/analyze.py index 377fed6c..64ed225f 100644 --- a/accuracy/scripts/analyze.py +++ b/accuracy/scripts/analyze.py @@ -128,14 +128,89 @@ def _row_key(r: dict) -> tuple: # ── Segment helpers ─────────────────────────────────────────────────────────── -def where(fn): - return [r for r in rows_data if fn(r)] +# Parameters the capacity planner currently accepts as inputs: +# model, tp, pp, dp, max_model_len, gpu_memory_utilization +# Parameters NOT yet modeled (go in Part 2 / Next Steps): +# --kv-cache-dtype, --dtype override, runtime --quantization fp8 + +def where(fn, pool=None): + src = pool if pool is not None else rows_data + return [r for r in src if fn(r)] + +# Part 1: only runs whose config is fully within the planner's input space. +# Excludes: dtype=float32 override, runtime fp8 quant, kv_cache_dtype=fp8. +def _part1(r: dict) -> bool: + return (r["dtype"] != "torch.float32" + and r["quantization"] != "fp8" + and r["kv_cache_dtype"] != "fp8") + +part1 = where(_part1) + +# Baseline: single-GPU, default context, unquantized — one clean row per model. +# Uses a seen-set to deduplicate overlapping sweep rows (tp=1 appears in both +# the TP sweep and the max_model_len sweep for the same model). 
+_base_seen: set = set() +base: list = [] +for r in where(lambda r: r["tp"] == 1 and r["pp"] == 1 and r["dp"] == 1 + and r["max_model_len"] == 8192 + and r["quantization"] in ("None", "", None), pool=part1): + if r["model"] not in _base_seen: + _base_seen.add(r["model"]) + base.append(r) + +# Sensitivity sweeps within Part 1 +def _dedup(rows: list[dict], key_fn) -> list[dict]: + seen: set = set() + out: list[dict] = [] + for r in rows: + k = key_fn(r) + if k not in seen: + seen.add(k) + out.append(r) + return out + +tp_sweep_llama = _dedup( + sorted(where(lambda r: r["model"] == "meta-llama/Llama-3.1-8B-Instruct" + and r["pp"] == 1 and r["max_model_len"] == 8192 + and r["quantization"] in ("None", "", None), pool=part1), + key=lambda r: r["tp"]), + key_fn=lambda r: r["tp"]) + +tp_sweep_qwen = _dedup( + sorted(where(lambda r: r["model"] == "Qwen/Qwen2.5-7B-Instruct" + and r["pp"] == 1 and r["max_model_len"] == 8192 + and r["quantization"] in ("None", "", None), pool=part1), + key=lambda r: r["tp"]), + key_fn=lambda r: r["tp"]) + +pp_sweep = _dedup( + sorted(where(lambda r: r["model"] == "meta-llama/Llama-3.1-8B-Instruct" + and r["tp"] == 1 and r["max_model_len"] == 8192 + and r["quantization"] in ("None", "", None), pool=part1), + key=lambda r: r["pp"]), + key_fn=lambda r: r["pp"]) + +len_sweep_llama = _dedup( + sorted(where(lambda r: r["model"] == "meta-llama/Llama-3.1-8B-Instruct" + and r["tp"] == 1 and r["pp"] == 1 + and r["quantization"] in ("None", "", None), pool=part1), + key=lambda r: r["max_model_len"]), + key_fn=lambda r: r["max_model_len"]) + +len_sweep_qwen = _dedup( + sorted(where(lambda r: r["model"] == "Qwen/Qwen2.5-7B-Instruct" + and r["tp"] == 1 and r["pp"] == 1 + and r["quantization"] in ("None", "", None), pool=part1), + key=lambda r: r["max_model_len"]), + key_fn=lambda r: r["max_model_len"]) -base = where(lambda r: r["tp"] == 1 and r["pp"] == 1 and r["max_model_len"] == 8192 - and r["quantization"] in ("None", "", None) and r["kv_cache_dtype"] 
!= "fp8") -multi = where(lambda r: r["tp"] > 1 or r["pp"] > 1) -quant = where(lambda r: r["quantization"] not in ("None", "", None)) -kvfp8 = where(lambda r: r["kv_cache_dtype"] == "fp8") +# Part 2: runs that exercise parameters not yet modeled by the planner. +kvfp8_rows = where(lambda r: r["kv_cache_dtype"] == "fp8") +dtype_rows = where(lambda r: r["dtype"] == "torch.float32" + or (r["dtype"] == "torch.float16" + and r["quantization"] in ("None", "", None))) +quant_rows = where(lambda r: r["quantization"] not in ("None", "", None) + or r["quantization"] == "fp8") # ── Report builder ──────────────────────────────────────────────────────────── @@ -145,85 +220,77 @@ def where(fn): def section(title: str, rows: list[dict]): - W(f"\n### {title} (n={len(rows)})\n") + W(f"\n#### {title} (n={len(rows)})\n") W("| Component | Mean error | Median | Mean abs | Min | Max | n |") W("|-----------|:----------:|:------:|:--------:|:---:|:---:|:-:|") - for key in ["weight", "activation", "non_torch", "cuda_graph", - "total_non_kv", "kv_cache", "max_concurrency"]: + for key in ["weight", "activation", "non_torch", "kv_cache", "max_concurrency"]: W(stats_row(key.replace("_", " ").title(), [r[f"err_{key}"] for r in rows])) +# ── Summary stats (Part 1) ──────────────────────────────────────────────────── + +kv_errs = [r["err_kv_cache"] for r in part1 if not math.isnan(r["err_kv_cache"])] +kv_base = [r["err_kv_cache"] for r in base if not math.isnan(r["err_kv_cache"])] +wt_errs = [r["err_weight"] for r in part1 if not math.isnan(r["err_weight"])] +act_errs = [r["err_activation"] for r in part1 if not math.isnan(r["err_activation"])] +nt_errs = [r["err_non_torch"] for r in part1 if not math.isnan(r["err_non_torch"])] +conc_errs = [r["err_max_concurrency"] for r in part1 if not math.isnan(r["err_max_concurrency"])] + +kv_mean = statistics.mean(kv_errs) +kv_abs = statistics.mean(abs(e) for e in kv_errs) +kv_base_m = statistics.mean(kv_base) +wt_mean = statistics.mean(wt_errs) +wt_abs = 
statistics.mean(abs(e) for e in wt_errs) +act_mean = statistics.mean(act_errs) +act_abs = statistics.mean(abs(e) for e in act_errs) +nt_mean = statistics.mean(nt_errs) +conc_mean = statistics.mean(conc_errs) +conc_abs = statistics.mean(abs(e) for e in conc_errs) + # ═══════════════════════════════════════════════════════════════════════════════ W("# Capacity Planner Accuracy Report — vLLM v0.19.0 / H100-80GB") W("") -W(f"**Dataset**: {len(rows_data)} successful runs across " - f"{len(set(r['model'] for r in rows_data))} unique models ") -W("**Hardware**: H100-80GB (catalog memory = 80 GiB, actual = ~79.19 GiB) ") -W("**Planner GPU util**: actual `gpu_memory_utilization` per run (0.95) ") +W("**Hardware**: H100-80GB (catalog 80 GiB, physical 79.19 GiB) ") +W("**Planner inputs evaluated**: model, tp, pp, dp, max_model_len, gpu_memory_utilization ") +W("") +W("> Percent error = (predicted − actual) / actual × 100. " + "Positive = over-estimate, negative = under-estimate.") W("") -# ── Executive Summary ───────────────────────────────────────────────────────── -W("## Executive Summary\n") - -kv_errs_all = [r["err_kv_cache"] for r in rows_data if not math.isnan(r["err_kv_cache"])] -kv_errs_base = [r["err_kv_cache"] for r in base if not math.isnan(r["err_kv_cache"])] -act_errs = [r["err_activation"] for r in rows_data if not math.isnan(r["err_activation"])] -wt_errs = [r["err_weight"] for r in rows_data if not math.isnan(r["err_weight"])] -nt_errs = [r["err_non_torch"] for r in rows_data if not math.isnan(r["err_non_torch"])] -conc_errs = [r["err_max_concurrency"] for r in rows_data if not math.isnan(r["err_max_concurrency"])] - -kv_mean_all = statistics.mean(kv_errs_all) -kv_abs_all = statistics.mean(abs(e) for e in kv_errs_all) -kv_mean_base = statistics.mean(kv_errs_base) -act_mean = statistics.mean(act_errs) -act_abs = statistics.mean(abs(e) for e in act_errs) -wt_mean = statistics.mean(wt_errs) -wt_abs = statistics.mean(abs(e) for e in wt_errs) -conc_mean = 
statistics.mean(conc_errs) -conc_abs = statistics.mean(abs(e) for e in conc_errs) - -W("| Metric | Mean error | Mean abs error | Notes |") -W("|--------|:----------:|:--------------:|-------|") -W(f"| **KV Cache memory** (all 47 runs) | {fmt(kv_mean_all)} | {fmt(kv_abs_all)} | |") -W(f"| **KV Cache memory** (baseline: tp=pp=1, len=8192, no-quant) | {fmt(kv_mean_base)} | — | n={len(kv_errs_base)} |") -W(f"| **Weight memory** | {fmt(wt_mean)} | {fmt(wt_abs)} | From safetensors metadata |") -W(f"| **Activation memory** | {fmt(act_mean)} | {fmt(act_abs)} | Largest error source |") -W(f"| **Non-torch overhead** | {fmt(statistics.mean(nt_errs))} | {fmt(statistics.mean(abs(e) for e in nt_errs))} | |") -W(f"| **Max concurrency** | {fmt(conc_mean)} | {fmt(conc_abs)} | Proxy for KV cache accuracy |") +# ───────────────────────────────────────────────────────────────────────────── +W("## Part 1: Accuracy Evaluation") W("") -W("### Key Findings\n") -W(f"1. **Weights are accurate** — mean abs error {fmt(wt_abs)}, computed directly from " - "safetensors parameter counts. Errors arise only when `--dtype` overrides the native " - "dtype (e.g., `--dtype float32`) or when quantization is not fully captured in the config.") -W(f"2. **Activation is the dominant error source** — mean {fmt(act_mean)} (over-estimate). " - "The planner uses empirical constants (4.8–8.0 GiB) measured at `max_model_len=16000`; " - "vLLM v0.19.0 reports 0.75–2.2 GiB across all architectures tested. Granite is worst (+600%), " - "Mistral3/Pixtral is best (+15–23%).") -W("3. **Over-estimated activation partially cancels** the catalog GPU memory inflation (+0.77 GiB), " - f"leaving KV cache only {fmt(kv_mean_all)} off on average across all runs. But this is " - "coincidental cancellation of two large opposing errors, not model accuracy.") -W("4. **Non-default KV dtype (`--kv-cache-dtype fp8`) doubles token capacity** but the planner " - "ignores this flag — KV token count is off by ~2× for those runs.") -W("5. 
**`--dtype float32` breaks weight prediction** — the planner uses the HuggingFace " - "config dtype (BF16) and never sees the vLLM `--dtype` override, giving −50% weight error.") -W("6. **Pipeline parallelism reduces actual activation** (each GPU processes fewer layers) " - "but the formula uses the same constant regardless of PP, compounding the activation error.") +W(f"Covers {len(part1)} runs across {len(set(r['model'] for r in part1))} models " + "using only parameters the planner currently accepts as inputs. " + "Excludes runs with `--dtype float32`, runtime `--quantization fp8`, " + "and `--kv-cache-dtype fp8` (see Part 2).") W("") -# ── Component Error Tables ──────────────────────────────────────────────────── -W("## Component-Level Error Breakdown\n") -W("> Percent error = (predicted − actual) / actual × 100. " - "Positive = over-estimate, negative = under-estimate.\n") - -section("All 47 Runs", rows_data) -section("Baseline: TP=1, PP=1, len=8192, no quantization, default KV dtype", base) -section("Multi-GPU (TP > 1 or PP > 1)", multi) -section("Quantized Models (fp8-dynamic / w8a8 / w4a16)", quant) -section("Non-default KV cache dtype (--kv-cache-dtype fp8)", kvfp8) +# ── Summary table ───────────────────────────────────────────────────────────── +W("### Summary\n") +W("| Metric | Mean error | Mean abs error | n |") +W("|--------|:----------:|:--------------:|:-:|") +W(f"| KV cache memory (all runs) | {fmt(kv_mean)} | {fmt(kv_abs)} | {len(kv_errs)} |") +W(f"| KV cache memory (baseline: tp=pp=dp=1, len=8192, no quant) | {fmt(kv_base_m)} | — | {len(kv_base)} |") +W(f"| Weight memory | {fmt(wt_mean)} | {fmt(wt_abs)} | {len(wt_errs)} |") +W(f"| Activation memory | {fmt(act_mean)} | {fmt(act_abs)} | {len(act_errs)} |") +W(f"| Non-torch overhead | {fmt(nt_mean)} | — | {len(nt_errs)} |") +W(f"| Max concurrency | {fmt(conc_mean)} | {fmt(conc_abs)} | {len(conc_errs)} |") +W("") +W("**Key findings**:\n") +W(f"- **Weight memory is accurate**: mean abs error 
{fmt(wt_abs)}, " + "computed directly from safetensors parameter counts.") +W(f"- **KV cache memory is close**: {fmt(kv_mean)} mean error across all runs; " + f"{fmt(kv_base_m)} at baseline. Errors are small and consistent.") +W(f"- **Activation is the dominant error source**: mean {fmt(act_mean)} (over-estimate). " + "The planner uses empirical constants measured against an older vLLM version; " + "v0.19.0 reports substantially lower values. See Root Cause Analysis.") +W(f"- **Max concurrency tracks KV accuracy**: {fmt(conc_mean)} mean error; " + "deviations come from the per-token KV formula, not the pool size prediction.") +W("") -# ── Per-Model Error Table ───────────────────────────────────────────────────── -W("\n## Per-Model Errors — Baseline Runs\n") -W("> TP=1, PP=1, max_model_len=8192, no quantization, default KV dtype.\n") +# ── Per-model baseline ───────────────────────────────────────────────────────── +W("### Per-Model Results — Baseline (TP=1, PP=1, DP=1, len=8192, no quantization)\n") W("| Model | Arch | Weight err | Activation err | Non-torch err | KV cache err | Max conc err |") W("|-------|------|:----------:|:--------------:|:-------------:|:------------:|:------------:|") for r in sorted(base, key=lambda x: x["model"]): @@ -235,315 +302,223 @@ def section(title: str, rows: list[dict]): f"{fmt(r['err_weight'])} | {fmt(r['err_activation'])} | " f"{fmt(r['err_non_torch'])} | {fmt(r['err_kv_cache'])} | " f"{fmt(r['err_max_concurrency'])} |") - -# ═══════════════════════════════════════════════════════════════════════════════ -W("\n## Argument Sensitivity Analysis\n") -W("> This section examines how each vLLM launch argument affects whether the " - "capacity planner's memory predictions remain accurate.\n") - -# ── max_model_len ───────────────────────────────────────────────────────────── -W("### `--max-model-len` (context window size)\n") - -llama_len = where(lambda r: "Llama-3.1-8B-Instruct" in r["model"] - and r["tp"] == 1 and r["pp"] == 1 - and 
r["quantization"] in ("None", "", None) - and r["kv_cache_dtype"] != "fp8") -llama_len.sort(key=lambda r: r["max_model_len"]) - -qwen_len = where(lambda r: r["model"] == "Qwen/Qwen2.5-7B-Instruct" - and r["tp"] == 1 and r["pp"] == 1 - and r["quantization"] in ("None", "", None) - and r["kv_cache_dtype"] != "fp8") -qwen_len.sort(key=lambda r: r["max_model_len"]) - -W("| Model | max_model_len | Actual KV (GiB) | KV err | Actual tokens | Pred tokens | Token err | Max conc err |") -W("|-------|:-------------:|:---------------:|:------:|:-------------:|:-----------:|:---------:|:------------:|") -for r in llama_len + qwen_len: - model_short = r["model"].split("/")[-1][:28] - W(f"| {model_short} | {r['max_model_len']:,} | " - f"{fv(r['actual_kv_cache'])} | {fmt(r['err_kv_cache'])} | " - f"{int(r['actual_kv_tokens']):,} | {int(r['pred_kv_tokens']):,} | " - f"{fmt(r['err_kv_tokens'])} | {fmt(r['err_max_concurrency'])} |") - W("") -W("**Conclusion**: `--max-model-len` has **no effect on KV pool size** — the formula and " - "vLLM agree on this. Activation memory is constant (the fixed profiling overhead does not " - "depend on context length), so the KV pool prediction error stays flat at ~−3 to −4% " - "regardless of whether context is 2 K or 32 K tokens. 
The token/concurrency predictions " - "carry that same constant KV error forward, plus any error from the per-token KV formula.") - -# ── TP ──────────────────────────────────────────────────────────────────────── -W("\n### `--tensor-parallel-size` (TP)\n") - -tp_sweep = where(lambda r: r["model"] == "meta-llama/Llama-3.1-8B-Instruct" - and r["max_model_len"] == 8192 - and r["quantization"] in ("None", "", None) - and r["kv_cache_dtype"] != "fp8" - and r["pp"] == 1) -tp_sweep.sort(key=lambda r: r["tp"]) - -qwen_tp = where(lambda r: r["model"] == "Qwen/Qwen2.5-7B-Instruct" - and r["max_model_len"] == 8192 - and r["quantization"] in ("None", "", None) - and r["kv_cache_dtype"] != "fp8" - and r["pp"] == 1) -qwen_tp.sort(key=lambda r: r["tp"]) +# ── TP sensitivity ───────────────────────────────────────────────────────────── +W("### Sensitivity: Tensor Parallelism (TP)\n") W("| Model | TP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err |") W("|-------|:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:|") -for r in tp_sweep + qwen_tp: - model_short = r["model"].split("/")[-1][:22] - W(f"| {model_short} | {r['tp']} | " +for r in tp_sweep_llama + tp_sweep_qwen: + W(f"| {r['model'].split('/')[-1][:22]} | {r['tp']} | " f"{fv(r['actual_weight'])} | {fmt(r['err_weight'])} | " f"{fv(r['actual_activation'])} | {fmt(r['err_activation'])} | " f"{fv(r['actual_non_torch'])} | {fmt(r['err_non_torch'])} | " f"{fmt(r['err_kv_cache'])} |") - W("") -W("**Conclusions**:\n") -W("- **Weights scale correctly**: the formula divides by TP, matching vLLM's per-GPU weight sharding. " - "Weight error stays near 0% across TP=1–4.") -W("- **Activation is TP-invariant in both formula and reality**: vLLM's profiling overhead does not " - "shrink with TP (it captures the same set of batch sizes). 
The formula also keeps activation " - "constant with TP. Error stays flat.") -W("- **Non-torch is heavily under-estimated for TP≥2**: the 0.60 GiB/GPU constant does not capture " - "NCCL all-reduce buffer overhead, which grows with TP. Actual non-torch reaches ~2.1 GiB/GPU at " - "TP=4 (3.5× the constant). However, this error is partially masked in KV cache accuracy because " - "the over-estimated activation pulls the prediction in the opposite direction.") - -# ── PP ──────────────────────────────────────────────────────────────────────── -W("\n### `--pipeline-parallel-size` (PP)\n") - -pp_sweep = where(lambda r: r["model"] == "meta-llama/Llama-3.1-8B-Instruct" - and r["max_model_len"] == 8192 - and r["quantization"] in ("None", "", None) - and r["kv_cache_dtype"] != "fp8" - and r["tp"] == 1) -pp_sweep.sort(key=lambda r: r["pp"]) +W("- **Weights scale correctly** with TP: error stays near 0% across TP=1–4.") +W("- **Activation is TP-invariant** in both formula and vLLM: error stays flat.") +W("- **Non-torch is under-estimated at TP≥2**: NCCL all-reduce buffers push actual to " + "~2.1 GiB/GPU but the constant is 0.60 GiB. 
The opposing over-estimate in activation " + "partially masks this in the KV cache error.") +W("") +# ── PP sensitivity ───────────────────────────────────────────────────────────── +W("### Sensitivity: Pipeline Parallelism (PP)\n") +W("Model: meta-llama/Llama-3.1-8B-Instruct, TP=1, len=8192\n") W("| PP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err |") W("|:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:|") +pp_acts = {} +pp_preds = {} for r in pp_sweep: + pp_acts[r["pp"]] = r["actual_activation"] + pp_preds[r["pp"]] = r["pred_activation"] W(f"| {r['pp']} | " f"{fv(r['actual_weight'])} | {fmt(r['err_weight'])} | " f"{fv(r['actual_activation'])} | {fmt(r['err_activation'])} | " f"{fv(r['actual_non_torch'])} | {fmt(r['err_non_torch'])} | " f"{fmt(r['err_kv_cache'])} |") - -# Compute activation values directly for the prose -pp_acts = {r["pp"]: r["actual_activation"] for r in pp_sweep} -pp_preds = {r["pp"]: r["pred_activation"] for r in pp_sweep} W("") -W("**Conclusions**:\n") -W(f"- **Activation drops sharply with PP**: at PP=1, vLLM profiles {fv(pp_acts.get(1,float('nan')))} GiB " - f"of activation; at PP=2 it drops to {fv(pp_acts.get(2,float('nan')))} GiB; " - f"at PP=4 to {fv(pp_acts.get(4,float('nan')))} GiB. " - "Each pipeline stage runs fewer transformer layers, so the profiling sweep allocates proportionally less. 
" - f"The formula does not account for this and always predicts {fv(pp_preds.get(1,float('nan')))} GiB, " - "making the activation error grow with PP (from ~+154% at PP=1 to ~+357% at PP=4).") -W("- **Non-torch increases with PP** due to inter-stage P2P send/receive buffers, " - "but the formula uses the same TP=1 constant (0.15 GiB/GPU) regardless of PP, " - "causing the non-torch estimate to overshoot actual (predicted > actual for PP>1 because " - "each stage is a separate process and 0.15 is per-GPU). " - "These two errors partially offset each other in the KV cache prediction.") -W("- **Weight error grows with PP**: the formula divides only by TP×PP for weight sharding, " - "but with PP=4, model layers are not uniformly distributed across stages in all cases " - "(irregular last-stage allocation can leave a stage with fewer params).") - -# ── dtype ───────────────────────────────────────────────────────────────────── -W("\n### `--dtype` (compute/storage dtype override)\n") - -dtype_sweep = where(lambda r: "Llama-3.1" in r["model"] - and r["tp"] == 1 and r["pp"] == 1 and r["max_model_len"] == 8192) -dtype_sweep.sort(key=lambda r: (r["dtype"], r["quantization"], r["kv_cache_dtype"])) - -W("| dtype arg | quantization | kv_cache_dtype | Actual weight (GiB) | Weight err | Actual KV (GiB) | KV err |") -W("|-----------|:------------:|:--------------:|:-------------------:|:----------:|:---------------:|:------:|") -for r in dtype_sweep: - W(f"| {r['dtype'].replace('torch.', '')} | {r['quantization']} | " - f"{r['kv_cache_dtype']} | " - f"{fv(r['actual_weight'])} | {fmt(r['err_weight'])} | " - f"{fv(r['actual_kv_cache'])} | {fmt(r['err_kv_cache'])} |") - +W(f"- **Activation drops with PP**: " + f"PP=1 → {fv(pp_acts.get(1,float('nan')))} GiB, " + f"PP=2 → {fv(pp_acts.get(2,float('nan')))} GiB, " + f"PP=4 → {fv(pp_acts.get(4,float('nan')))} GiB. 
" + f"The formula always predicts {fv(pp_preds.get(1,float('nan')))} GiB regardless of PP.") +W("- **Weight error grows with PP**: layer imbalance across stages causes the formula " + "(which assumes uniform distribution) to deviate at high PP.") W("") -W("**Conclusions**:\n") -W("- **`--dtype float32`** doubles model weight memory (29.98 GiB vs BF16's 14.99 GiB). " - "The planner reads the HuggingFace config dtype (BF16) and is unaware of the `--dtype` " - "vLLM override, so it predicts 14.96 GiB — a **−50% weight error**, which cascades into " - "a +31% KV cache over-prediction (the planner thinks there is more room than there is).") -W("- **`--dtype float16`** is handled correctly because the HuggingFace config also stores " - "float16 for these models; weight error stays near 0%.") -W("- **FP8-dynamic quantization** (`fp8` in the quantization column) halves weight memory. " - "The planner reads `quantization_config` from the HuggingFace repo and applies the FP8 " - "byte-per-param, yielding near-zero weight error. KV cache error stays consistent with " - "the activation over-estimation.") -W("- **`--kv-cache-dtype fp8`** does not affect weight or activation predictions, but halves " - "per-token KV storage. 
The planner ignores this flag and predicts KV tokens ~50% too low " - "(see dedicated section below).") - -# ── quantization ────────────────────────────────────────────────────────────── -W("\n### `--quantization` (weight quantization method)\n") - -quant_rows = where(lambda r: r["quantization"] not in ("None", "", None)) -quant_rows.sort(key=lambda r: (r["quantization"], r["model"])) - -W("| Model | quant method | TP | Actual weight (GiB) | Weight err | Actual KV (GiB) | KV err |") -W("|-------|--------------|----|:-------------------:|:----------:|:---------------:|:------:|") -for r in quant_rows: - model_short = r["model"].split("/")[-1][:30] - W(f"| {model_short} | {r['quantization']} | {r['tp']} | " - f"{fv(r['actual_weight'])} | {fmt(r['err_weight'])} | " - f"{fv(r['actual_kv_cache'])} | {fmt(r['err_kv_cache'])} |") +# ── max_model_len sensitivity ────────────────────────────────────────────────── +W("### Sensitivity: Context Length (max_model_len)\n") +W("| Model | max_model_len | Actual KV (GiB) | KV err | Actual tokens | Pred tokens | Token err |") +W("|-------|:-------------:|:---------------:|:------:|:-------------:|:-----------:|:---------:|") +for r in len_sweep_llama + len_sweep_qwen: + W(f"| {r['model'].split('/')[-1][:28]} | {r['max_model_len']:,} | " + f"{fv(r['actual_kv_cache'])} | {fmt(r['err_kv_cache'])} | " + f"{int(r['actual_kv_tokens']):,} | {int(r['pred_kv_tokens']):,} | " + f"{fmt(r['err_kv_tokens'])} |") W("") -W("**Conclusions**:\n") -W("- **w8a8 (compressed-tensors INT8)**: the planner parses `config_groups` from the " - "`quantization_config` to find `num_bits=8` and applies 1 byte/param. Weight errors " - "are near zero (−0.3 to −0.7%), indicating the INT8 parameter count is well-captured.") -W("- **w4a16 (GPTQ-marlin INT4)**: the planner parses `num_bits=4` from the quantization " - "config and applies 0.5 bytes/param. Weight error is small (~−0.7%). 
" - "The large reduction in weights (5.3 GiB vs 15 GiB for BF16) frees more KV cache, " - "and the planner correctly tracks this effect — KV error stays in the −3% range.") -W("- **fp8-dynamic** (fp8 per-tensor dynamic quant via `compressed-tensors`): " - "the planner extracts fp8 precision from the quantization config. " - "Weight error is near zero. Unexpectedly, weight error for the RedHat fp8 70B model " - "at TP=2 stays very low, confirming the quant config parsing is correct for this variant.") - -# ── kv_cache_dtype ──────────────────────────────────────────────────────────── -W("\n### `--kv-cache-dtype` (KV cache precision)\n") - -kv_dtype_rows = where(lambda r: r["kv_cache_dtype"] == "fp8") -kv_dtype_rows.sort(key=lambda r: r["model"]) - -# Find the matching default-kv rows for the same model -kv_default_rows = [] -for kfp8 in kv_dtype_rows: - match = where(lambda r, m=kfp8: (r["model"] == m["model"] - and r["tp"] == m["tp"] - and r["pp"] == m["pp"] - and r["max_model_len"] == m["max_model_len"] - and r["kv_cache_dtype"] != "fp8" - and r["quantization"] in ("None","",None))) - if match: - kv_default_rows.append(match[0]) - -W("| Model | kv_cache_dtype | Actual KV (GiB) | KV err | Actual tokens | Pred tokens | Token err | Conc err |") -W("|-------|:--------------:|:---------------:|:------:|:-------------:|:-----------:|:---------:|:--------:|") -for row_pair in zip(kv_default_rows, kv_dtype_rows): - for r in row_pair: - model_short = r["model"].split("/")[-1][:28] - W(f"| {model_short} | {r['kv_cache_dtype']} | " - f"{fv(r['actual_kv_cache'])} | {fmt(r['err_kv_cache'])} | " - f"{int(r['actual_kv_tokens']):,} | {int(r['pred_kv_tokens']):,} | " - f"{fmt(r['err_kv_tokens'])} | {fmt(r['err_max_concurrency'])} |") - W("|||||||||") - +W("- **KV pool size (GiB) is independent of max_model_len**: both formula and vLLM agree. 
" + "The pool is sized from available memory, not from a pre-allocated token count.") +W("- **Token count predictions vary**: the per-token KV bytes formula has model-dependent " + "errors that show up consistently across all context lengths.") W("") -W("**Conclusion**: `--kv-cache-dtype fp8` stores each KV element in 1 byte instead of 2 bytes " - "(BF16/FP16), doubling the number of tokens that fit in the KV pool. The KV pool size in GiB " - "is unaffected (same activation and weight overhead), so the **KV GiB error stays near −4%** " - "(the same as the default-dtype baseline). But because the planner always computes per-token " - "bytes from the model's native compute dtype, **token count and max-concurrency predictions " - "are ~52% too low** for fp8-KV runs. This is a direct, fixable bug: the planner should accept " - "`kv_cache_dtype` as an input parameter and apply 1 byte/token when it is `fp8`.") - -# ── Root Cause Summary ──────────────────────────────────────────────────────── -W("\n## Root Cause Analysis\n") - -W("### 1. Activation Memory — Largest Error Source\n") -W("The planner uses **fixed constants per architecture** (e.g., 4.8 GiB for Llama, " - "5.6 GiB for Qwen2/3) empirically measured at `max_model_len=16000`. " - "vLLM v0.19.0 reports substantially lower values during its profiling phase:\n") -W("| Architecture | Predicted (GiB) | Observed range (GiB) | Error range |") -W("|-------------|:---------------:|:--------------------:|:-----------:|") + +# ── Root cause analysis ──────────────────────────────────────────────────────── +W("### Root Cause Analysis\n") + +W("#### 1. Activation Constants Are Stale\n") +W("The planner uses fixed constants per architecture (e.g., 4.8 GiB for Llama) " + "calibrated against an older vLLM version. 
vLLM v0.19.0 reports substantially lower values:\n") +W("| Architecture | Planner constant (GiB) | Observed v0.19.0 range (GiB) | Error range |") +W("|-------------|:---------------------:|:----------------------------:|:-----------:|") archs_seen: dict[str, list] = {} -for r in rows_data: - arch = r["architecture"] +for r in part1: if not math.isnan(r["err_activation"]): - archs_seen.setdefault(arch, []).append( + archs_seen.setdefault(r["architecture"], []).append( (r["actual_activation"], r["pred_activation"], r["err_activation"])) for arch, data in sorted(archs_seen.items()): acts = [d[0] for d in data] preds = [d[1] for d in data] errs = [d[2] for d in data] - arch_label = (arch.replace("ForCausalLM", "") - .replace("ForConditionalGeneration", "*"))[:35] + arch_label = (arch.replace("ForCausalLM","").replace("ForConditionalGeneration","*"))[:35] W(f"| {arch_label} | {fv(statistics.mean(preds))} | " f"{fv(min(acts))}–{fv(max(acts))} | " f"{fmt(min(errs))} to {fmt(max(errs))} |") W("") -W("The discrepancy suggests the constants were measured with an older vLLM version or " - "different compilation settings. Re-calibrating to these v0.19.0 measurements would be " - "the highest-value fix.") +W("Re-calibrating these constants from the v0.19.0 measurements is the highest-value fix.") -W("\n### 2. Non-torch Memory — Underestimated for Multi-GPU\n") -W("| TP | PP | Constant used | Actual mean (GiB) | Mean error |") -W("|:--:|:--:|:-------------:|:-----------------:|:----------:|") +W("\n#### 2. 
Non-torch Constants Under-estimated for Multi-GPU\n") +W("| TP | PP | Constant used (GiB) | Observed mean (GiB) | Mean error |") +W("|:--:|:--:|:-------------------:|:-------------------:|:----------:|") for tp_v, pp_v in [(1,1),(1,2),(1,4),(2,1),(4,1)]: - grp = where(lambda r, t=tp_v, p=pp_v: r["tp"]==t and r["pp"]==p) - if not grp: - continue - const = 0.15 if tp_v == 1 else 0.60 + grp = where(lambda r, t=tp_v, p=pp_v: r["tp"]==t and r["pp"]==p, pool=part1) + if not grp: continue + const = 0.15 if tp_v == 1 and pp_v == 1 else (0.60 if tp_v > 1 else 0.15) acts = [r["actual_non_torch"] for r in grp if not math.isnan(r["actual_non_torch"])] errs = [r["err_non_torch"] for r in grp if not math.isnan(r["err_non_torch"])] - if not acts: - continue - W(f"| {tp_v} | {pp_v} | {const} GiB | {fv(statistics.mean(acts))} | " + if not acts: continue + W(f"| {tp_v} | {pp_v} | {const} | {fv(statistics.mean(acts))} | " f"{fmt(statistics.mean(errs))} |") +W("") +W("TP≥2 requires NCCL all-reduce buffers (~2.1 GiB/GPU vs the 0.60 GiB constant). " + "PP≥2 adds P2P send/receive buffers that the formula ignores entirely.") -W("\nFor TP=1 the formula slightly under-estimates (0.15 vs ~0.25 GiB actual). " - "For TP≥2, NCCL all-reduce buffers push actual non-torch to ~2.1 GiB — 3.5× " - "the 0.60 GiB constant. For PP≥2, P2P send/receive adds overhead that the formula " - "doesn't model at all.") - -W("\n### 3. GPU Memory Catalog vs Physical\n") -W("The planner uses 80 GiB (catalog) but H100 physical VRAM is 79.19 GiB:\n") -W("- Catalog available: 80 × 0.95 = **76.00 GiB**") -W("- Physical available: 79.19 × 0.95 = **75.23 GiB**") -W("- Systematic KV over-prediction from this source alone: **+0.77 GiB**") +W("\n#### 3. GPU Catalog vs Physical Memory\n") +W("Planner uses 80 GiB (catalog); H100 physical VRAM is 79.19 GiB. \n" + "Effect: KV pool over-predicted by ~0.77 GiB (76.00 vs 75.23 GiB at 0.95 utilization).") -W("\n### 4. 
CUDA Graph Memory — Excluded from Formula\n") -cg_vals = [r["actual_cuda_graph"] for r in rows_data +cg_vals = [r["actual_cuda_graph"] for r in part1 if not math.isnan(r["actual_cuda_graph"]) and r["actual_cuda_graph"] > 0] -W("The planner returns 0.0 GiB for CUDA graphs (treating it as included in activation). " - "vLLM allocates the CUDA graph pool *after* sizing the KV cache, so the reported " - "KV pool includes CUDA graph memory. The formula is therefore consistent with the " - "log-reported KV number — no fix needed, but it should be documented.") if cg_vals: - W(f"\nObserved CUDA graph pool sizes: {fv(min(cg_vals))}–{fv(max(cg_vals))} GiB " - f"(mean {fv(statistics.mean(cg_vals))} GiB).") - -# ── Recommendations ─────────────────────────────────────────────────────────── -W("\n## Recommendations\n") -W("| Priority | Fix | Expected impact |") -W("|:--------:|-----|:---------------:|") -W("| 🔴 High | **Re-calibrate activation constants** from v0.19.0 measurements. " - "Current constants are 2–7× too high. Updating to ~1.0–2.2 GiB/architecture would " - "remove the single largest prediction error. | +4–10 GiB KV accuracy |") -W("| 🔴 High | **Accept `--kv-cache-dtype` as a planner input.** When set to `fp8`, " - "halve the per-token KV bytes. This is a one-line formula change. " - "| 2× token/concurrency accuracy for fp8-KV runs |") -W("| 🔴 High | **Accept `--dtype` as a planner input.** When set to `float32`, " - "double the per-param bytes for weight estimation. " + W(f"\n#### 4. CUDA Graph Memory\n") + W(f"Observed pool sizes: {fv(min(cg_vals))}–{fv(max(cg_vals))} GiB " + f"(mean {fv(statistics.mean(cg_vals))} GiB). 
" + "vLLM allocates CUDA graphs after sizing the KV cache, so the reported KV pool " + "already includes CUDA graph memory — no formula correction needed.") + +# ───────────────────────────────────────────────────────────────────────────── +W("\n---\n") +W("## Part 2: Next Steps — Parameters Not Yet Modeled\n") +W("The following vLLM flags affect memory allocation but are not yet accepted as " + "planner inputs. Each subsection quantifies the prediction gap to inform " + "which inputs to add next.") +W("") + +# ── kv_cache_dtype ───────────────────────────────────────────────────────────── +W("### `--kv-cache-dtype fp8`\n") +kv_fp8 = where(lambda r: r["kv_cache_dtype"] == "fp8") +kv_fp8.sort(key=lambda r: r["model"]) +kv_auto_map = {} +for r in where(lambda r: r["kv_cache_dtype"] != "fp8" + and r["quantization"] in ("None","",None)): + k = (r["model"], r["tp"], r["pp"], r["max_model_len"]) + kv_auto_map[k] = r + +W("| Model | kv_cache_dtype | Actual KV (GiB) | KV GiB err | Actual tokens | Pred tokens | Token err |") +W("|-------|:--------------:|:---------------:|:----------:|:-------------:|:-----------:|:---------:|") +for fp8r in kv_fp8: + k = (fp8r["model"], fp8r["tp"], fp8r["pp"], fp8r["max_model_len"]) + autr = kv_auto_map.get(k) + if autr: + W(f"| {autr['model'].split('/')[-1][:28]} | auto | " + f"{fv(autr['actual_kv_cache'])} | {fmt(autr['err_kv_cache'])} | " + f"{int(autr['actual_kv_tokens']):,} | {int(autr['pred_kv_tokens']):,} | " + f"{fmt(autr['err_kv_tokens'])} |") + W(f"| {fp8r['model'].split('/')[-1][:28]} | fp8 | " + f"{fv(fp8r['actual_kv_cache'])} | {fmt(fp8r['err_kv_cache'])} | " + f"{int(fp8r['actual_kv_tokens']):,} | {int(fp8r['pred_kv_tokens']):,} | " + f"{fmt(fp8r['err_kv_tokens'])} |") + W("||||||||") +W("") +W("**KV pool size (GiB) is unaffected** — fp8 halves per-token storage, not the pool. " + "The planner's GiB prediction stays accurate. 
" + "**Token count is ~2× too low** because the planner always uses the model's native " + "dtype (BF16 = 2 bytes/element) instead of fp8 (1 byte/element). " + "Fix: accept `kv_cache_dtype` as input; when `fp8`, use 1 byte/token.") +W("") + +# ── dtype override ───────────────────────────────────────────────────────────── +W("### `--dtype` override\n") +dtype_rows_all = where(lambda r: "Llama-3.1" in r["model"] + and r["tp"] == 1 and r["pp"] == 1 and r["max_model_len"] == 8192 + and r["quantization"] in ("None","",None)) +dtype_rows_all.sort(key=lambda r: r["dtype"]) +W("| dtype | Actual weight (GiB) | Weight err | Actual KV (GiB) | KV err |") +W("|-------|:-------------------:|:----------:|:---------------:|:------:|") +for r in dtype_rows_all: + W(f"| {r['dtype'].replace('torch.','')} | " + f"{fv(r['actual_weight'])} | {fmt(r['err_weight'])} | " + f"{fv(r['actual_kv_cache'])} | {fmt(r['err_kv_cache'])} |") +W("") +W("**`--dtype float32`** doubles weight memory. The planner reads the HF config dtype " + "(BF16) and has no visibility into the vLLM override → −50% weight error, +31% KV error. \n" + "**`--dtype float16`** matches the HF config for these models → near-zero error. \n" + "Fix: accept `dtype` as input and use it to override the bytes-per-param calculation.") +W("") + +# ── runtime quantization ─────────────────────────────────────────────────────── +W("### Runtime `--quantization fp8`\n") +rtfp8 = where(lambda r: r["quantization"] == "fp8") +W("| Model | Actual weight (GiB) | Weight err | Actual KV (GiB) | KV err |") +W("|-------|:-------------------:|:----------:|:---------------:|:------:|") +for r in rtfp8: + W(f"| {r['model'].split('/')[-1][:35]} | " + f"{fv(r['actual_weight'])} | {fmt(r['err_weight'])} | " + f"{fv(r['actual_kv_cache'])} | {fmt(r['err_kv_cache'])} |") +W("") +W("Runtime `--quantization fp8` compresses weights on-the-fly after loading. " + "vLLM logs the post-compression size (~half of BF16). 
The planner finds no " + "`quantization_config` in the HF repo and predicts the full BF16 weight → ~+76% weight error. \n" + "Fix: accept `quantization fp8` as input; apply 1 byte/param for weight estimation.") +W("") + +# ── Recommendations ──────────────────────────────────────────────────────────── +W("### Recommendations\n") +W("| Priority | Input to add | Expected impact |") +W("|:--------:|-------------|:---------------:|") +W("| High | **Re-calibrate activation constants** from v0.19.0 measurements. " + "Current constants are 2–7× too high. | Removes largest single error source |") +W("| High | **`kv_cache_dtype`** — when `fp8`, use 1 byte/token for KV. " + "| Fixes ~2× token/concurrency error for fp8-KV runs |") +W("| Medium | **`dtype`** — when `float32`, double bytes-per-param. " "| Fixes −50% weight error for float32 runs |") -W("| 🟡 Medium | **Re-measure non-torch constants for TP≥2 and PP≥2.** " - "NCCL overhead scales with both and is currently under-estimated by ~3.5×. " +W("| Medium | **`quantization fp8` (runtime)** — apply 1 byte/param. " + "| Fixes +76% weight error for runtime-fp8 runs |") +W("| Medium | **Re-measure non-torch constants for TP≥2 and PP≥2.** " "| +1–2 GiB KV accuracy for multi-GPU |") -W("| 🟡 Medium | **Scale activation constant by 1/PP.** " - "Each pipeline stage processes layers/PP transformer blocks; " - "profiling overhead scales proportionally. " +W("| Medium | **Scale activation constant by 1/PP.** " "| Fixes growing activation error at high PP |") -W("| 🟢 Low | **Use physical GPU memory** (79.19 GiB for H100) rather than " - "the catalog 80 GiB nominal. | +0.77 GiB KV accuracy |") +W("| Low | **Use physical GPU memory** (79.19 GiB) instead of catalog 80 GiB. 
" + "| +0.77 GiB KV accuracy |") report = "\n".join(lines) OUT_MD.write_text(report) print(f"Report written → {OUT_MD}") print(f"\n{'─'*60}") -print("HEADLINE NUMBERS") +print("HEADLINE NUMBERS (Part 1 — supported inputs only)") print(f"{'─'*60}") -print(f" KV cache mean error (all): {fmt(kv_mean_all)}") -print(f" KV cache mean error (baseline): {fmt(kv_mean_base)}") +print(f" KV cache mean error (all): {fmt(kv_mean)}") +print(f" KV cache mean error (baseline): {fmt(kv_base_m)}") print(f" Weights mean abs error: {fmt(wt_abs)}") print(f" Activation mean error: {fmt(act_mean)}") print(f" Activation mean abs error: {fmt(act_abs)}") diff --git a/accuracy/scripts/sweep.yaml b/accuracy/scripts/sweep.yaml index 2feb7647..581fe600 100644 --- a/accuracy/scripts/sweep.yaml +++ b/accuracy/scripts/sweep.yaml @@ -242,3 +242,32 @@ runs: # tp: 1 # _label: fp8dyn-redhatai-qwen2-5-7b # _sweep_dim: quantization + + # ── vLLM version sensitivity: activation memory across releases ─────────── + # Goal: validate whether activation constants change across vLLM versions. + # Model: Qwen3-14B (tp=1, ~28 GiB bf16; tp=5 invalid — vocab not divisible). 
+ # See sweep-versions.yaml to run these; results go in results/v/ + # DONE: v0.15.0 + # - model: Qwen/Qwen3-14B + # tp: 1 + # vllm_image: vllm/vllm-openai:v0.15.0 + # _label: qwen3-14b-vllm-v0.15.0 + # _sweep_dim: vllm_version + # DONE: v0.16.0 + # - model: Qwen/Qwen3-14B + # tp: 1 + # vllm_image: vllm/vllm-openai:v0.16.0 + # _label: qwen3-14b-vllm-v0.16.0 + # _sweep_dim: vllm_version + # DONE: v0.17.0 + # - model: Qwen/Qwen3-14B + # tp: 1 + # vllm_image: vllm/vllm-openai:v0.17.0 + # _label: qwen3-14b-vllm-v0.17.0 + # _sweep_dim: vllm_version + # DONE: v0.18.0 + # - model: Qwen/Qwen3-14B + # tp: 1 + # vllm_image: vllm/vllm-openai:v0.18.0 + # _label: qwen3-14b-vllm-v0.18.0 + # _sweep_dim: vllm_version From a95e9f62e5de8cb5d2beb5f5279e7271a231416d Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Wed, 22 Apr 2026 18:05:18 -0400 Subject: [PATCH 08/24] Update report Signed-off-by: Jing Chen --- accuracy/results/v0.19.0/accuracy_report.md | 33 ++++++++++--------- .../results/v0.19.0/results_predicted.csv | 5 +-- accuracy/results/v0.19.0/results_raw.csv | 5 +++ accuracy/results/v0.19.0/run_matrix.md | 11 +++++-- ...-gemma-7b--h100-80gb--tp1pp1dp1--8192.json | 29 ++++++++++++++++ ...oft-phi-2--h100-80gb--tp1pp1dp1--2048.json | 29 ++++++++++++++++ ...t-oss-20b--h100-80gb--tp2pp1dp1--8192.json | 29 ++++++++++++++++ accuracy/results/version-sweep/run_matrix.md | 31 +++++++++++++++++ ...m-v0-15-0--h100-80gb--tp1pp1dp1--8192.json | 29 ++++++++++++++++ ...m-v0-16-0--h100-80gb--tp1pp1dp1--8192.json | 29 ++++++++++++++++ ...m-v0-17-0--h100-80gb--tp1pp1dp1--8192.json | 29 ++++++++++++++++ ...m-v0-18-0--h100-80gb--tp1pp1dp1--8192.json | 30 +++++++++++++++++ accuracy/scripts/sweep.yaml | 11 ++++++- 13 files changed, 279 insertions(+), 21 deletions(-) create mode 100644 accuracy/results/v0.19.0/runs/google-gemma-7b--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/microsoft-phi-2--h100-80gb--tp1pp1dp1--2048.json create mode 100644 
accuracy/results/v0.19.0/runs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.json create mode 100644 accuracy/results/version-sweep/run_matrix.md create mode 100644 accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-15-0--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-16-0--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-17-0--h100-80gb--tp1pp1dp1--8192.json create mode 100644 accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-18-0--h100-80gb--tp1pp1dp1--8192.json diff --git a/accuracy/results/v0.19.0/accuracy_report.md b/accuracy/results/v0.19.0/accuracy_report.md index 3242079d..fa010780 100644 --- a/accuracy/results/v0.19.0/accuracy_report.md +++ b/accuracy/results/v0.19.0/accuracy_report.md @@ -7,25 +7,25 @@ ## Part 1: Accuracy Evaluation -Covers 50 runs across 29 models using only parameters the planner currently accepts as inputs. Excludes runs with `--dtype float32`, runtime `--quantization fp8`, and `--kv-cache-dtype fp8` (see Part 2). +Covers 51 runs across 32 models using only parameters the planner currently accepts as inputs. Excludes runs with `--dtype float32`, runtime `--quantization fp8`, and `--kv-cache-dtype fp8` (see Part 2). 
### Summary | Metric | Mean error | Mean abs error | n | |--------|:----------:|:--------------:|:-:| -| KV cache memory (all runs) | +0.28% | +6.70% | 50 | -| KV cache memory (baseline: tp=pp=dp=1, len=8192, no quant) | -5.29% | — | 19 | -| Weight memory | -0.75% | +0.75% | 50 | -| Activation memory | +188.64% | +188.64% | 50 | -| Non-torch overhead | -43.54% | — | 50 | -| Max concurrency | -2.35% | +9.90% | 50 | +| KV cache memory (all runs) | +0.34% | +6.62% | 51 | +| KV cache memory (baseline: tp=pp=dp=1, len=8192, no quant) | -5.12% | — | 20 | +| Weight memory | -0.89% | +0.89% | 51 | +| Activation memory | +195.12% | +195.12% | 51 | +| Non-torch overhead | -44.08% | — | 51 | +| Max concurrency | +3.34% | +15.34% | 51 | **Key findings**: -- **Weight memory is accurate**: mean abs error +0.75%, computed directly from safetensors parameter counts. -- **KV cache memory is close**: +0.28% mean error across all runs; -5.29% at baseline. Errors are small and consistent. -- **Activation is the dominant error source**: mean +188.64% (over-estimate). The planner uses empirical constants measured against an older vLLM version; v0.19.0 reports substantially lower values. See Root Cause Analysis. -- **Max concurrency tracks KV accuracy**: -2.35% mean error; deviations come from the per-token KV formula, not the pool size prediction. +- **Weight memory is accurate**: mean abs error +0.89%, computed directly from safetensors parameter counts. +- **KV cache memory is close**: +0.34% mean error across all runs; -5.12% at baseline. Errors are small and consistent. +- **Activation is the dominant error source**: mean +195.12% (over-estimate). The planner uses empirical constants measured against an older vLLM version; v0.19.0 reports substantially lower values. See Root Cause Analysis. +- **Max concurrency tracks KV accuracy**: +3.34% mean error; deviations come from the per-token KV formula, not the pool size prediction. 
### Per-Model Results — Baseline (TP=1, PP=1, DP=1, len=8192, no quantization) @@ -42,6 +42,7 @@ Covers 50 runs across 29 models using only parameters the planner currently acce | gemma-3-12b-it | Gemma3* | -2.61% | +39.59% | -40.00% | -0.15% | +0.00% | | gemma-3-27b-it | Gemma3* | -0.69% | +37.84% | -42.31% | -1.42% | +11.43% | | gemma-3-4b-it | Gemma3* | -6.65% | +41.39% | -40.00% | -0.27% | +2.84% | +| gemma-7b | Gemma | -0.05% | +51.52% | -40.00% | -1.79% | -1.77% | | granite-3.1-2b-instruct | Granite | -0.44% | +633.33% | -67.39% | -5.27% | -5.27% | | granite-3.1-8b-instruct | Granite | -0.20% | +547.06% | -67.39% | -6.02% | -6.03% | | granite-3.3-8b-instruct | Granite | -0.20% | +547.06% | -67.39% | -6.02% | -6.03% | @@ -108,6 +109,8 @@ The planner uses fixed constants per architecture (e.g., 4.8 GiB for Llama) cali | DeepseekV2 | 8.00 | 1.93–1.93 | +314.51% to +314.51% | | Gemma2 | 5.50 | 3.62–3.66 | +50.27% to +51.93% | | Gemma3* | 5.50 | 3.89–3.99 | +37.84% to +41.39% | +| Gemma | 5.50 | 3.63–3.63 | +51.52% to +51.52% | +| GptOss | 8.00 | 2.87–2.87 | +178.75% to +178.75% | | Granite | 5.50 | 0.75–0.85 | +547.06% to +633.33% | | KimiVL* | 8.00 | 2.85–2.92 | +173.97% to +180.70% | | Llama | 4.80 | 0.77–1.97 | +143.65% to +523.38% | @@ -115,6 +118,7 @@ The planner uses fixed constants per architecture (e.g., 4.8 GiB for Llama) cali | Mistral3* | 2.50 | 2.03–2.18 | +14.68% to +23.15% | | Mixtral | 8.00 | 1.21–1.21 | +561.16% to +561.16% | | Phi3 | 5.50 | 1.52–1.52 | +261.84% to +261.84% | +| Phi | 5.50 | 0.79–0.79 | +596.20% to +596.20% | | Qwen2 | 5.60 | 2.21–2.29 | +144.54% to +153.39% | | Qwen3 | 5.60 | 2.21–2.21 | +153.39% to +153.39% | | Qwen3Moe | 8.00 | 2.68–2.68 | +198.51% to +198.51% | @@ -128,7 +132,7 @@ Re-calibrating these constants from the v0.19.0 measurements is the highest-valu | 1 | 1 | 0.15 | 0.27 | -42.17% | | 1 | 2 | 0.15 | 0.07 | +114.29% | | 1 | 4 | 0.15 | 0.07 | +114.29% | -| 2 | 1 | 0.6 | 2.08 | -71.17% | +| 2 | 1 | 0.6 | 2.08 | -71.15% 
| | 4 | 1 | 0.6 | 2.17 | -72.34% | TP≥2 requires NCCL all-reduce buffers (~2.1 GiB/GPU vs the 0.60 GiB constant). PP≥2 adds P2P send/receive buffers that the formula ignores entirely. @@ -140,7 +144,7 @@ Effect: KV pool over-predicted by ~0.77 GiB (76.00 vs 75.23 GiB at 0.95 utilizat #### 4. CUDA Graph Memory -Observed pool sizes: 0.51–1.85 GiB (mean 1.03 GiB). vLLM allocates CUDA graphs after sizing the KV cache, so the reported KV pool already includes CUDA graph memory — no formula correction needed. +Observed pool sizes: 0.51–1.85 GiB (mean 1.04 GiB). vLLM allocates CUDA graphs after sizing the KV cache, so the reported KV pool already includes CUDA graph memory — no formula correction needed. --- @@ -155,7 +159,7 @@ The following vLLM flags affect memory allocation but are not yet accepted as pl | Qwen2.5-7B-Instruct | auto | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | | Qwen2.5-7B-Instruct | fp8 | 58.53 | -4.21% | 2,192,000 | 1,049,789 | -52.11% | |||||||| -| Llama-3.1-8B-Instruct | auto | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | +| Llama-3.1-8B-Instruct | auto | 42.80 | +31.06% | 175,296 | 459,509 | +162.13% | | Llama-3.1-8B-Instruct | fp8 | 58.11 | -3.47% | 952,032 | 459,509 | -51.73% | |||||||| @@ -167,7 +171,6 @@ The following vLLM flags affect memory allocation but are not yet accepted as pl |-------|:-------------------:|:----------:|:---------------:|:------:| | bfloat16 | 14.99 | -0.22% | 58.11 | -3.47% | | bfloat16 | 14.99 | -0.22% | 58.11 | -3.47% | -| bfloat16 | 14.99 | -0.22% | 58.11 | -3.47% | | float16 | 14.99 | -0.22% | 58.11 | -3.47% | | float32 | 29.98 | -50.11% | 42.80 | +31.06% | diff --git a/accuracy/results/v0.19.0/results_predicted.csv b/accuracy/results/v0.19.0/results_predicted.csv index 16a719e0..2b0da6b8 100644 --- a/accuracy/results/v0.19.0/results_predicted.csv +++ b/accuracy/results/v0.19.0/results_predicted.csv @@ -15,13 +15,13 @@ meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,8192,torch.float16,None,auto,0. 
meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,8192,torch.float32,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,131072.0,14.9575,4.8,0.15,0.0,19.9075,56.0925,459509,56.09,14.9575,56.0925 meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,2048,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,131072.0,14.9575,4.8,0.15,0.0,19.9075,56.0925,459509,224.37,14.9575,56.0925 meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,4096,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,131072.0,14.9575,4.8,0.15,0.0,19.9075,56.0925,459509,112.18,14.9575,56.0925 -meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,131072.0,14.9575,4.8,0.15,0.0,19.9075,56.0925,459509,56.09,14.9575,56.0925 meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,2,1,8192,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,65536.0,7.4788,4.8,0.15,0.0,12.4288,65.9712,1080872,131.94,14.9575,131.9425 meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,4,1,8192,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,32768.0,3.7394,4.8,0.15,0.0,8.6894,70.9106,2323599,283.64,14.9575,283.6425 meta-llama/Llama-3.1-8B-Instruct,H100-80GB,2,1,1,8192,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,65536.0,7.4788,4.8,0.6,0.0,12.8788,65.5212,1073499,131.04,14.9575,131.0425 meta-llama/Llama-3.1-8B-Instruct,H100-80GB,4,1,1,8192,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,32768.0,3.7394,4.8,0.6,0.0,9.1394,70.4606,2308853,281.84,14.9575,281.8425 meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,16384,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,131072.0,14.9575,4.8,0.15,0.0,19.9075,56.0925,459509,28.05,14.9575,56.0925 
meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,32768,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,131072.0,14.9575,4.8,0.15,0.0,19.9075,56.0925,459509,14.02,14.9575,56.0925 +microsoft/phi-2,H100-80GB,1,1,1,2048,torch.float16,None,auto,0.95,PhiForCausalLM,Multi-head attention,32,32,80,2,327680,327680.0,5.1776,5.5,0.15,0.0,10.8276,65.1724,213557,104.28,5.1776,65.1724 microsoft/phi-4,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Phi3ForCausalLM,Grouped-query attention,40,10,128,2,204800,204800.0,27.3055,5.5,0.15,0.0,32.9555,43.0445,225677,27.55,27.3055,43.0445 mistralai/Mistral-Small-3.1-24B-Instruct-2503,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Mistral3ForConditionalGeneration,Grouped-query attention,40,8,128,2,163840,163840.0,44.7246,2.5,0.15,0.0,47.3746,28.6254,187599,22.9,44.7246,28.6254 mistralai/Mixtral-8x7B-Instruct-v0.1,H100-80GB,2,1,1,8192,torch.bfloat16,None,auto,0.95,MixtralForCausalLM,Grouped-query attention,32,8,128,2,131072,65536.0,43.4954,8.0,0.6,0.0,52.0954,27.9046,457189,55.81,86.9907,55.8093 @@ -29,6 +29,7 @@ moonshotai/Kimi-Dev-72B,H100-80GB,2,1,1,8192,torch.bfloat16,None,auto,0.95,Qwen2 moonshotai/Kimi-Dev-72B,H100-80GB,4,1,1,8192,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,80,8,128,2,327680,81920.0,33.8565,5.6,0.6,0.0,40.0565,40.1435,526169,64.23,135.4259,160.5741 moonshotai/Kimi-VL-A3B-Instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,KimiVLForConditionalGeneration,Multi-head attention,27,16,128,2,221184,221184.0,30.5616,8.0,0.15,0.0,38.7116,37.2884,181017,22.1,30.5616,37.2884 moonshotai/Kimi-VL-A3B-Instruct,H100-80GB,2,1,1,8192,torch.bfloat16,None,auto,0.95,KimiVLForConditionalGeneration,Multi-head attention,27,16,128,2,221184,110592.0,15.2808,8.0,0.6,0.0,23.8808,56.1192,544863,66.51,30.5616,112.2384 +openai/gpt-oss-20b,H100-80GB,2,1,1,8192,torch.bfloat16,mxfp4,auto,0.9,GptOssForCausalLM,Grouped-query 
attention,24,8,64,0.53125,13056,6528.0,6.4081,8.0,0.6,0.0,15.0081,60.9919,10032102,1224.62,12.8162,121.9838 Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,fp8,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,57344.0,14.1852,5.6,0.15,0.0,19.9352,56.0648,1049789,128.15,14.1852,56.0648 Qwen/Qwen2.5-72B-Instruct,H100-80GB,2,1,1,8192,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,80,8,128,2,327680,163840.0,67.7129,5.6,0.6,0.0,73.9129,4.8871,32027,3.91,135.4259,9.7741 Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,57344.0,14.1852,5.6,0.15,0.0,19.9352,56.0648,1049789,128.15,14.1852,56.0648 @@ -36,7 +37,6 @@ Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,16384,torch.bfloat16,None,auto,0.95,Qwe Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,32768,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,57344.0,14.1852,5.6,0.15,0.0,19.9352,56.0648,1049789,32.04,14.1852,56.0648 Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,2048,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,57344.0,14.1852,5.6,0.15,0.0,19.9352,56.0648,1049789,512.59,14.1852,56.0648 Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,4096,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,57344.0,14.1852,5.6,0.15,0.0,19.9352,56.0648,1049789,256.3,14.1852,56.0648 -Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,57344.0,14.1852,5.6,0.15,0.0,19.9352,56.0648,1049789,128.15,14.1852,56.0648 Qwen/Qwen2.5-7B-Instruct,H100-80GB,2,1,1,8192,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,28672.0,7.0926,5.6,0.6,0.0,13.2926,65.5074,2453196,299.46,14.1852,131.0148 Qwen/Qwen2.5-7B-Instruct,H100-80GB,4,1,1,8192,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query 
attention,28,4,128,2,57344,14336.0,3.5463,5.6,0.6,0.0,9.7463,70.4537,5276861,644.15,14.1852,281.8148 Qwen/Qwen3-30B-A3B,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Qwen3MoeForCausalLM,Grouped-query attention,48,4,128,2,98304,98304.0,56.8705,8.0,0.15,0.0,65.0205,10.9795,119925,14.64,56.8705,10.9795 @@ -53,3 +53,4 @@ google/gemma-2-9b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Gemma2Fo google/gemma-3-12b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Gemma3ForConditionalGeneration,Grouped-query attention,48,8,256,2,393216,393216.0,22.7007,5.5,0.15,0.0,28.3507,47.6493,130114,15.88,22.7007,47.6493 google/gemma-3-27b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Gemma3ForConditionalGeneration,Grouped-query attention,62,16,128,2,507904,507904.0,51.0968,5.5,0.15,0.0,56.7468,19.2532,40702,4.97,51.0968,19.2532 google/gemma-3-4b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Gemma3ForConditionalGeneration,Grouped-query attention,34,4,256,2,139264,139264.0,8.0095,5.5,0.15,0.0,13.6595,62.3405,480652,58.67,8.0095,62.3405 +google/gemma-7b,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,GemmaForCausalLM,Multi-head attention,28,16,256,2,458752,458752.0,15.9027,5.5,0.15,0.0,21.5527,54.4473,127437,15.56,15.9027,54.4473 diff --git a/accuracy/results/v0.19.0/results_raw.csv b/accuracy/results/v0.19.0/results_raw.csv index fe20222e..7c0b7d90 100644 --- a/accuracy/results/v0.19.0/results_raw.csv +++ b/accuracy/results/v0.19.0/results_raw.csv @@ -17,6 +17,7 @@ google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log,failed,,H100-80GB,, google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.log,ok,google/gemma-3-27b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,51.45,51.45,3.99,0.26,55.7,1.05,1.14,19.53,36560,15997,4.46,78.68,79.19,51,512,51 google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log,failed,,H100-80GB,,,,,,,,0.95,,,,,,,,,,,,,,,,,,,,, 
google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.log,ok,google/gemma-3-4b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,8.58,8.58,3.89,0.25,12.72,0.7,0.67,62.51,468144,204817,57.05,78.68,79.19,51,512,51 +google-gemma-7b--h100-80gb--tp1pp1dp1--8192.log,ok,google/gemma-7b,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,15.91,15.91,3.63,0.25,19.79,0.84,0.94,55.44,129760,8110,15.84,78.68,79.19,51,512,51 granite-3-1-2b--h100-80gb--tp1pp1dp1--8192.log,ok,ibm-granite/granite-3.1-2b-instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,4.74,4.74,0.75,0.46,5.95,1.6,0.84,69.28,908048,56753,110.85,78.68,79.19,51,512,51 granite-3-1-8b--h100-80gb--tp1pp1dp1--8192.log,ok,ibm-granite/granite-3.1-8b-instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,15.25,15.25,0.85,0.46,16.57,0.74,0.98,58.66,384432,24027,46.93,78.68,79.19,51,512,51 ibm-granite-granite-3-3---h100-80gb--tp1pp1dp1--8192.log,ok,ibm-granite/granite-3.3-8b-instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,15.25,15.25,0.85,0.46,16.57,0.74,0.98,58.66,384432,24027,46.93,78.68,79.19,51,512,51 @@ -37,6 +38,8 @@ meta-llama-llama-3-1-8b---h100-80gb--tp4pp1dp1--8192.log,ok,meta-llama/Llama-3.1 meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--16384.log,ok,meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,16384,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,14.99,14.99,1.89,0.25,17.12,0.84,0.91,58.11,476016,29751,29.05,78.68,79.19,51,512,51 meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--32768.log,ok,meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,32768,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,14.99,14.99,1.89,0.25,17.12,0.84,0.91,58.11,476016,29751,14.53,78.68,79.19,51,512,51 
meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.FAILED.log,failed,meta-llama/Llama-4-Scout-17B-16E-Instruct,H100-80GB,4,1,1,8192,torch.bfloat16,None,auto,0.95,77.12,79.19,2.07,2.05,75.23,,,,,,,,,,,,,,,, +microsoft-phi-2--h100-80gb--tp1pp1dp1--2048.log,ok,microsoft/phi-2,H100-80GB,1,1,1,2048,torch.float16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,5.19,5.19,0.79,0.24,6.23,0.55,0.76,69.0,226112,14132,110.41,78.68,79.19,51,512,51 +microsoft-phi-2--h100-80gb--tp1pp1dp1--8192.FAILED.log,failed,,H100-80GB,,,,,,,,0.95,,,,,,,,,,,,,,,,,,,,, microsoft-phi-4--h100-80gb--tp1pp1dp1--8192.log,ok,microsoft/phi-4,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,27.39,27.39,1.52,0.25,29.15,0.81,1.07,46.08,241568,15098,29.49,78.68,79.19,51,512,51 mistralai-mistral-small---h100-80gb--tp1pp1dp1--8192.log,ok,mistralai/Mistral-Small-3.1-24B-Instruct-2503,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,44.76,44.76,2.03,0.25,47.04,0.84,1.06,28.19,184752,11547,22.55,78.68,79.19,51,512,51 mistralai-mixtral-8x7b-i--h100-80gb--tp2pp1dp1--8192.log,ok,mistralai/Mixtral-8x7B-Instruct-v0.1,H100-80GB,2,1,1,8192,torch.bfloat16,None,auto,0.95,77.18,79.19,2.01,1.99,75.23,43.51,43.51,1.21,2.07,46.79,0.82,0.93,28.44,465936,29121,56.88,77.18,79.19,51,512,51 @@ -46,6 +49,8 @@ moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.log,ok,moonshotai/Kimi-Dev-7 moonshotai-kimi-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.log,ok,moonshotai/Kimi-VL-A3B-Instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,30.74,30.74,2.92,0.25,33.91,1.19,1.5,41.32,1426368,89148,174.12,78.68,79.19,51,512,51 moonshotai-kimi-vl-a3b-i--h100-80gb--tp2pp1dp1--8192.log,ok,moonshotai/Kimi-VL-A3B-Instruct,H100-80GB,2,1,1,8192,torch.bfloat16,None,auto,0.95,77.18,79.19,2.01,1.99,75.23,15.53,15.53,2.85,2.07,20.45,1.04,1.26,54.78,1890896,118181,230.82,77.18,79.19,51,512,51 
openai-gpt-oss-20b--h100-80gb--tp1pp1dp1--8192.FAILED.log,failed,openai/gpt-oss-20b,H100-80GB,1,1,1,8192,torch.bfloat16,mxfp4,auto,0.95,78.68,79.19,0.51,0.51,75.23,13.64,13.64,2.87,0.25,16.76,1.3,1.84,58.47,1277184,,155.75,78.68,79.19,83,1024,83 +openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.FAILED.log,failed,openai/gpt-oss-20b,H100-80GB,2,1,1,8192,torch.bfloat16,mxfp4,auto,0.95,77.18,79.19,2.01,1.99,75.23,7.01,7.01,2.87,2.07,11.95,1.09,1.55,63.28,2764768,,337.17,77.18,79.19,83,1024,83 +openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.log,ok,openai/gpt-oss-20b,H100-80GB,2,1,1,8192,torch.bfloat16,mxfp4,auto,0.9,77.18,79.19,2.01,1.99,71.27,7.01,7.01,2.87,2.07,11.95,1.09,1.55,59.32,2591776,323972,316.07,77.18,79.19,83,1024,83 qwen-qwen2---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.log,ok,Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,fp8,0.95,78.68,79.19,0.51,0.51,75.23,14.25,14.25,2.21,0.24,16.7,0.61,0.86,58.53,2192000,137000,267.58,78.68,79.19,51,512,51 qwen-qwen2-5-72b-instruc--h100-80gb--tp2pp1dp1--8192.log,ok,Qwen/Qwen2.5-72B-Instruct,H100-80GB,2,1,1,8192,torch.bfloat16,None,auto,0.95,77.66,79.19,1.53,1.51,75.23,67.8,67.8,2.29,2.09,72.19,1.63,1.61,3.04,19920,1245,2.43,77.66,79.19,51,512,51 qwen-qwen2-5-7b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.log,ok,Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,14.25,14.25,2.21,0.24,16.7,0.61,0.87,58.53,1096000,68500,133.79,78.68,79.19,51,512,51 diff --git a/accuracy/results/v0.19.0/run_matrix.md b/accuracy/results/v0.19.0/run_matrix.md index 40c7678c..21e96b2e 100644 --- a/accuracy/results/v0.19.0/run_matrix.md +++ b/accuracy/results/v0.19.0/run_matrix.md @@ -1,8 +1,8 @@ # Run Matrix — vLLM v0.19.0 / H100-80GB -**52 successful runs, 6 failed runs.** +**55 successful runs, 8 failed runs.** -Quantization abbreviations: `ct` = compressed-tensors, `gptq` = gptq_marlin, `fp8` = fp8 inline, `—` = none. 
+Quantization abbreviations: `ct` = compressed-tensors, `gptq` = gptq_marlin, `fp8` = fp8 inline, `mxfp4` = mx-format fp4, `—` = none. ## Successful Runs @@ -16,6 +16,7 @@ Quantization abbreviations: `ct` = compressed-tensors, `gptq` = gptq_marlin, `fp | google/gemma-3-12b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -2.6% | +39.6% | -40.0% | -0.1% | | google/gemma-3-27b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.7% | +37.8% | -42.3% | -1.4% | | google/gemma-3-4b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -6.6% | +41.4% | -40.0% | -0.3% | +| google/gemma-7b | 1 | 1 | 1 | 8192 | bf16 | — | auto | +0.0% | -34.0% | +66.7% | +1.8% | | ibm-granite/granite-3.1-2b-instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.4% | +633.3% | -67.4% | -5.3% | | ibm-granite/granite-3.1-8b-instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +547.1% | -67.4% | -6.0% | | ibm-granite/granite-3.3-8b-instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +547.1% | -67.4% | -6.0% | @@ -33,6 +34,7 @@ Quantization abbreviations: `ct` = compressed-tensors, `gptq` = gptq_marlin, `fp | meta-llama/Llama-3.1-8B-Instruct | 4 | 1 | 1 | 8192 | bf16 | — | auto | -0.8% | +154.0% | -71.8% | +4.5% | | meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 16384 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | | meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 32768 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | +| microsoft/phi-2 | 1 | 1 | 1 | 2048 | f16 | — | auto | +0.2% | -85.6% | +60.0% | +5.9% | | microsoft/phi-4 | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.3% | +261.8% | -40.0% | -6.6% | | mistralai/Mistral-Small-3.1-24B-Instruct-2503 | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +23.2% | -40.0% | +1.5% | | mistralai/Mixtral-8x7B-Instruct-v0.1 | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +561.2% | -71.0% | -1.9% | @@ -40,6 +42,7 @@ Quantization abbreviations: `ct` = compressed-tensors, `gptq` = gptq_marlin, `fp | moonshotai/Kimi-Dev-72B | 4 | 1 | 1 | 8192 | bf16 | — | auto | -0.5% | +144.5% | 
-72.9% | +9.3% | | moonshotai/Kimi-VL-A3B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.6% | +174.0% | -40.0% | -9.8% | | moonshotai/Kimi-VL-A3B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -1.6% | +180.7% | -71.0% | +2.4% | +| openai/gpt-oss-20b | 2 | 1 | 1 | 8192 | bf16 | mxfp4 | auto | +9.4% | -64.1% | +245.0% | -2.7% | | Qwen/Qwen2.5-72B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +144.5% | -71.3% | +60.8% | | Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | fp8 | -0.5% | +153.4% | -37.5% | -4.2% | | Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | @@ -68,8 +71,10 @@ Quantization abbreviations: `ct` = compressed-tensors, `gptq` = gptq_marlin, `fp | codellama/CodeLlama-34b-hf | 2 | 1 | 1 | 8192 | GPU contention at runtime | | meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 2 | 8192 | DP=2 | | meta-llama/Llama-4-Scout-17B-16E-Instruct | 4 | 1 | 1 | 8192 | | +| microsoft/phi-2 | 1 | 1 | 1 | 8192 | max_model_len=8192 > max_position_embeddings=2048; fixed with max_model_len=2048 | | moonshotai/Kimi-Dev-72B | 4 | 1 | 1 | 8192 | second attempt; tp=2 succeeded | -| openai/gpt-oss-20b | 1 | 1 | 1 | 8192 | | +| openai/gpt-oss-20b | 1 | 1 | 1 | 8192 | sampler warmup OOM (~786 MiB needed, <552 MiB free) | +| openai/gpt-oss-20b | 2 | 1 | 1 | 8192 | sampler warmup OOM at gmu=0.95; succeeded at gmu=0.90 | | Qwen/Qwen3-14B | 5 | 1 | 1 | 8192 | tp=5 invalid (vocab not divisible by 5) | ## Calibration decisions diff --git a/accuracy/results/v0.19.0/runs/google-gemma-7b--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/google-gemma-7b--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..4d2885bc --- /dev/null +++ b/accuracy/results/v0.19.0/runs/google-gemma-7b--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,29 @@ +{ + "model": "google/gemma-7b", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + 
"max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-22T19:58:16.586011+00:00", + "log_path": "/data/results/v0.19.0/logs/google-gemma-7b--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 15.91, + "kv_cache_memory_gib": 55.44, + "cuda_graph_memory_gib": 0.84, + "max_concurrency": 15.84, + "kv_cache_tokens": 129760, + "vllm_version": null, + "vllm_commit": null, + "total_non_kv_memory_gib": 19.79, + "activation_memory_gib": 3.63, + "non_torch_forward_memory_gib": 0.25, + "profiling_weights_memory_gib": 15.91, + "kv_cache_blocks": 8110, + "kv_block_size_bytes": 7340104 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/microsoft-phi-2--h100-80gb--tp1pp1dp1--2048.json b/accuracy/results/v0.19.0/runs/microsoft-phi-2--h100-80gb--tp1pp1dp1--2048.json new file mode 100644 index 00000000..b60600b8 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/microsoft-phi-2--h100-80gb--tp1pp1dp1--2048.json @@ -0,0 +1,29 @@ +{ + "model": "microsoft/phi-2", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 2048, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-22T20:02:09.081677+00:00", + "log_path": "/data/results/v0.19.0/logs/microsoft-phi-2--h100-80gb--tp1pp1dp1--2048.log", + "weight_memory_gib": 5.19, + "kv_cache_memory_gib": 69.0, + "cuda_graph_memory_gib": 0.55, + "max_concurrency": 110.41, + "kv_cache_tokens": 226112, + "vllm_version": null, + "vllm_commit": null, + "total_non_kv_memory_gib": 6.23, + "activation_memory_gib": 0.79, + "non_torch_forward_memory_gib": 0.24, + "profiling_weights_memory_gib": 5.19, + "kv_cache_blocks": 14132, + "kv_block_size_bytes": 5242583 +} \ No newline at end of file diff --git 
a/accuracy/results/v0.19.0/runs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.json new file mode 100644 index 00000000..94c8549c --- /dev/null +++ b/accuracy/results/v0.19.0/runs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.json @@ -0,0 +1,29 @@ +{ + "model": "openai/gpt-oss-20b", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 2, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.9, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-22T20:00:17.727316+00:00", + "log_path": "/data/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.log", + "weight_memory_gib": 7.01, + "kv_cache_memory_gib": 59.32, + "cuda_graph_memory_gib": 1.09, + "max_concurrency": 316.07, + "kv_cache_tokens": 2591776, + "vllm_version": null, + "vllm_commit": null, + "total_non_kv_memory_gib": 11.95, + "activation_memory_gib": 2.87, + "non_torch_forward_memory_gib": 2.07, + "profiling_weights_memory_gib": 7.01, + "kv_cache_blocks": 161986, + "kv_block_size_bytes": 393209 +} \ No newline at end of file diff --git a/accuracy/results/version-sweep/run_matrix.md b/accuracy/results/version-sweep/run_matrix.md new file mode 100644 index 00000000..df539830 --- /dev/null +++ b/accuracy/results/version-sweep/run_matrix.md @@ -0,0 +1,31 @@ +# Run Matrix — vLLM Version Sensitivity / Qwen3-14B / H100-80GB + +**Goal**: Track how activation memory reported by vLLM changes across releases, to identify +when planner constants became stale. + +**4 successful runs, 1 failed run (first attempt only).** + +Model: `Qwen/Qwen3-14B` — tp=1, pp=1, dp=1, max_model_len=8192, dtype=auto, quant=none. +All runs on a single H100-80GB at `gpu_memory_utilization=0.95`. 
+ +## Results + +| vLLM version | Weight (GiB) | Activation (GiB) | Non-torch (GiB) | KV cache (GiB) | Max concurrency | +|:---:|:---:|:---:|:---:|:---:|:---:| +| v0.15.0 | 27.52 | 5.64 | 0.13 | 41.94 | 33.55 | +| v0.16.0 | 27.52 | 5.64 | 0.13 | 41.94 | 33.55 | +| **v0.17.0** | 27.52 | **2.23** | 0.13 | 45.34 | 36.27 | +| v0.18.0 | 27.52 | 2.23 | 0.25 | 45.23 | 36.18 | + +**Key finding**: Activation memory dropped from 5.64 GiB to 2.23 GiB (−60%) between v0.16.0 and v0.17.0. +Weight memory is stable across all versions (as expected — model parameters don't change). +KV cache increased by ~3.4 GiB at v0.17.0+ because lower activation overhead leaves more headroom. + +The planner's Qwen3 activation constant (5.60 GiB) matches v0.16.0 exactly — the constants +were calibrated against v0.16.0 or earlier. + +## Failed Runs + +| vLLM version | Notes | +|:---:|---| +| v0.16.0 (attempt 1) | GPU contention at startup: only 61.8/79.19 GiB free (needed 75.23 GiB). Succeeded on retry. | diff --git a/accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-15-0--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-15-0--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..045271cf --- /dev/null +++ b/accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-15-0--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,29 @@ +{ + "model": "Qwen/Qwen3-14B", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-22T19:31:24.972860+00:00", + "log_path": "/data/results/version-sweep/unknown/logs/qwen3-14b-vllm-v0-15-0--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 27.52, + "kv_cache_memory_gib": 41.94, + "max_concurrency": 33.55, + "kv_cache_tokens": 274848, + "vllm_version": null, + "vllm_commit": null, + 
"total_non_kv_memory_gib": 33.29, + "activation_memory_gib": 5.64, + "non_torch_forward_memory_gib": 0.13, + "profiling_weights_memory_gib": 27.52, + "kv_cache_blocks": 17178, + "kv_block_size_bytes": 2621535, + "_sweep_dim": "vllm_version" +} \ No newline at end of file diff --git a/accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-16-0--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-16-0--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..ca8369c2 --- /dev/null +++ b/accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-16-0--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,29 @@ +{ + "model": "Qwen/Qwen3-14B", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-22T19:45:05.746689+00:00", + "log_path": "/data/results/version-sweep/unknown/logs/qwen3-14b-vllm-v0-16-0--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 27.52, + "kv_cache_memory_gib": 41.94, + "max_concurrency": 33.55, + "kv_cache_tokens": 274848, + "vllm_version": null, + "vllm_commit": null, + "total_non_kv_memory_gib": 33.29, + "activation_memory_gib": 5.64, + "non_torch_forward_memory_gib": 0.13, + "profiling_weights_memory_gib": 27.52, + "kv_cache_blocks": 17178, + "kv_block_size_bytes": 2621535, + "_sweep_dim": "vllm_version" +} \ No newline at end of file diff --git a/accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-17-0--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-17-0--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..52f3a7cd --- /dev/null +++ b/accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-17-0--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,29 @@ +{ + "model": "Qwen/Qwen3-14B", + "gpu": "H100-80GB", + "vllm_args": { + 
"tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-22T19:39:05.997480+00:00", + "log_path": "/data/results/version-sweep/unknown/logs/qwen3-14b-vllm-v0-17-0--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 27.52, + "kv_cache_memory_gib": 45.34, + "max_concurrency": 36.27, + "kv_cache_tokens": 297152, + "vllm_version": null, + "vllm_commit": null, + "total_non_kv_memory_gib": 29.89, + "activation_memory_gib": 2.23, + "non_torch_forward_memory_gib": 0.13, + "profiling_weights_memory_gib": 27.52, + "kv_cache_blocks": 18572, + "kv_block_size_bytes": 2621336, + "_sweep_dim": "vllm_version" +} \ No newline at end of file diff --git a/accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-18-0--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-18-0--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..76cea9bb --- /dev/null +++ b/accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-18-0--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,30 @@ +{ + "model": "Qwen/Qwen3-14B", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-22T19:40:26.487283+00:00", + "log_path": "/data/results/version-sweep/unknown/logs/qwen3-14b-vllm-v0-18-0--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 27.52, + "kv_cache_memory_gib": 45.23, + "cuda_graph_memory_gib": 0.9, + "max_concurrency": 36.18, + "kv_cache_tokens": 296400, + "vllm_version": null, + "vllm_commit": null, + "total_non_kv_memory_gib": 30.0, + "activation_memory_gib": 2.23, + "non_torch_forward_memory_gib": 0.25, + "profiling_weights_memory_gib": 27.52, 
+ "kv_cache_blocks": 18525, + "kv_block_size_bytes": 2621610, + "_sweep_dim": "vllm_version" +} \ No newline at end of file diff --git a/accuracy/scripts/sweep.yaml b/accuracy/scripts/sweep.yaml index 581fe600..24594b8e 100644 --- a/accuracy/scripts/sweep.yaml +++ b/accuracy/scripts/sweep.yaml @@ -67,10 +67,19 @@ runs: - model: mistralai/Mixtral-8x7B-Instruct-v0.1 # 56B total, 14B active MoE tp: 2 # tp=1 OOM: ~87 GiB weights exceed single H100 80GB - - model: openai/gpt-oss-20b # 20B dense; tp=1 OOM during CUDA graph warmup + - model: openai/gpt-oss-20b # 20B dense; sampler warmup needs ~786 MiB/GPU beyond KV; gmu=0.90 leaves headroom tp: 2 + gpu_memory_utilization: "0.90" # gpt-oss-120b skipped — OOM or infra unavailability at all tested tp values + - model: microsoft/phi-2 # 2.7B dense; Phi architecture; max ctx = 2048 + tp: 1 + max_model_len: 2048 + + - model: google/gemma-7b # 7B dense; Gemma architecture (gated) + tp: 1 + hf_token_secret: hf-token-gemma + - model: Qwen/Qwen2.5-7B-Instruct # 7B dense; reference model for sensitivity sweeps tp: 1 From 3c876c8a84d11e11c7092d5dc76ffeb47a3115ba Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Wed, 22 Apr 2026 21:38:34 -0400 Subject: [PATCH 09/24] More coverage and detailed report Signed-off-by: Jing Chen --- accuracy/accuracy_report.md | 456 ++++++++++++++++++ accuracy/results/v0.19.0/accuracy_report.md | 200 -------- .../results/v0.19.0/results_predicted.csv | 59 ++- accuracy/results/v0.19.0/results_raw.csv | 3 + accuracy/results/v0.19.0/run_matrix.md | 82 ---- ...a-4-scout--h100-80gb--tp4pp1dp1--8192.json | 29 ++ ...moe-a2-7b--h100-80gb--tp1pp1dp1--8192.json | 29 ++ accuracy/results/version-sweep/run_matrix.md | 31 -- accuracy/scripts/sweep.yaml | 9 + 9 files changed, 564 insertions(+), 334 deletions(-) create mode 100644 accuracy/accuracy_report.md delete mode 100644 accuracy/results/v0.19.0/accuracy_report.md delete mode 100644 accuracy/results/v0.19.0/run_matrix.md create mode 100644 
accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.json create mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen1-5-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/version-sweep/run_matrix.md diff --git a/accuracy/accuracy_report.md b/accuracy/accuracy_report.md new file mode 100644 index 00000000..2e7e0af0 --- /dev/null +++ b/accuracy/accuracy_report.md @@ -0,0 +1,456 @@ +# Capacity Planner Accuracy Report + +**Hardware**: H100-80GB (catalog 80 GiB, physical 79.19 GiB) +**vLLM version evaluated**: v0.19.0 +**Planner inputs evaluated**: model, tp, pp, dp, max_model_len, gpu_memory_utilization + +> Percent error = (predicted − actual) / actual × 100. Positive = over-estimate, negative = under-estimate. + +--- + +## Part 1: Accuracy Evaluation — vLLM v0.19.0 + +Covers 53 runs across 34 models using only parameters the planner currently accepts as inputs. Excludes runs with `--dtype float32`, runtime `--quantization fp8`, and `--kv-cache-dtype fp8` (see Part 3). + +### Summary + +| Metric | Mean error | Mean abs error | n | +|--------|:----------:|:--------------:|:-:| +| KV cache memory (all runs) | +0.34% | +6.62% | 52 | +| KV cache memory (baseline: tp=pp=dp=1, len=8192, no quant) | -5.12% | — | 21 | +| Weight memory | -0.89% | +0.89% | 52 | +| Activation memory | +195.12% | +195.12% | 52 | +| Non-torch overhead | -44.08% | — | 52 | +| Max concurrency | +3.34% | +15.34% | 52 | + +**Key findings**: + +- **Weight memory is accurate**: mean abs error +0.89%, computed directly from safetensors parameter counts. +- **KV cache memory is close**: +0.34% mean error across all runs; -5.12% at baseline. Errors are small and consistent. +- **Activation is the dominant error source**: mean +195.12% (over-estimate). The planner uses empirical constants calibrated against vLLM v0.16.0 or earlier; v0.17.0 introduced a ~60% reduction in reported activation overhead that persists through v0.19.0. See Part 2. 
+- **Max concurrency tracks KV accuracy**: +3.34% mean error; deviations come from the per-token KV formula, not the pool size prediction. + +### Per-Model Results — Baseline (TP=1, PP=1, DP=1, len=8192, no quantization) + +| Model | Arch | Weight err | Activation err | Non-torch err | KV cache err | Max conc err | +|-------|------|:----------:|:--------------:|:-------------:|:------------:|:------------:| +| Qwen2.5-7B-Instruct | Qwen2 | -0.45% | +153.39% | -37.50% | -4.21% | -4.22% | +| Qwen3-30B-A3B | Qwen3Moe | -0.02% | +198.51% | -44.44% | -28.75% | -28.72% | +| Qwen3-8B | Qwen3 | -0.09% | +153.39% | -40.00% | -4.36% | -4.36% | +| CodeLlama-7b-hf | Llama | -0.07% | +523.38% | -40.00% | -5.13% | -5.13% | +| DeepSeek-V2-Lite-Chat | DeepseekV2 | -0.59% | +314.51% | -42.31% | -11.50% | -11.50% | +| gemma-2-27b-it | Gemma2 | -0.01% | +50.27% | -42.31% | -4.64% | -4.61% | +| gemma-2-2b-it | Gemma2 | -0.62% | +51.93% | -37.50% | -1.49% | -1.39% | +| gemma-2-9b-it | Gemma2 | -0.03% | +50.68% | -40.00% | -1.82% | -1.75% | +| gemma-3-12b-it | Gemma3* | -2.61% | +39.59% | -40.00% | -0.15% | +0.00% | +| gemma-3-27b-it | Gemma3* | -0.69% | +37.84% | -42.31% | -1.42% | +11.43% | +| gemma-3-4b-it | Gemma3* | -6.65% | +41.39% | -40.00% | -0.27% | +2.84% | +| gemma-7b | Gemma | -0.05% | +51.52% | -40.00% | -1.79% | -1.77% | +| granite-3.1-2b-instruct | Granite | -0.44% | +633.33% | -67.39% | -5.27% | -5.27% | +| granite-3.1-8b-instruct | Granite | -0.20% | +547.06% | -67.39% | -6.02% | -6.03% | +| granite-3.3-8b-instruct | Granite | -0.20% | +547.06% | -67.39% | -6.02% | -6.03% | +| granite-vision-3.3-2b | LlavaNext* | +0.04% | +216.46% | -40.00% | -1.23% | -1.23% | +| Llama-3.1-8B-Instruct | Llama | -0.22% | +153.97% | -40.00% | -3.47% | -3.48% | +| phi-4 | Phi3 | -0.31% | +261.84% | -40.00% | -6.59% | -6.58% | +| Mistral-Small-3.1-24B-Instruct-2503 | Mistral3* | -0.08% | +23.15% | -40.00% | +1.54% | +1.55% | +| Qwen1.5-MoE-A2.7B | Qwen2Moe | -0.02% | +223.89% | -40.00% | 
-10.14% | -10.12% | +| Kimi-VL-A3B-Instruct | KimiVL* | -0.58% | +173.97% | -40.00% | -9.76% | -87.31% | + +### Sensitivity: Tensor Parallelism (TP) + +| Model | TP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err | +|-------|:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:| +| Llama-3.1-8B-Instruct | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | +| Llama-3.1-8B-Instruct | 2 | 7.51 | -0.42% | 1.89 | +153.97% | 2.07 | -71.01% | +2.76% | +| Llama-3.1-8B-Instruct | 4 | 3.77 | -0.81% | 1.89 | +153.97% | 2.13 | -71.83% | +4.48% | +| Qwen2.5-7B-Instruct | 1 | 14.25 | -0.45% | 2.21 | +153.39% | 0.24 | -37.50% | -4.21% | +| Qwen2.5-7B-Instruct | 2 | 7.12 | -0.38% | 2.21 | +153.39% | 2.06 | -70.87% | +2.61% | +| Qwen2.5-7B-Instruct | 4 | 3.55 | -0.10% | 2.21 | +153.39% | 2.13 | -71.83% | +4.62% | + +- **Weights scale correctly** with TP: error stays near 0% across TP=1–4. +- **Activation is TP-invariant** in both formula and vLLM: error stays flat. +- **Non-torch is under-estimated at TP≥2**: NCCL all-reduce buffers push actual to ~2.1 GiB/GPU but the constant is 0.60 GiB. + +### Sensitivity: Pipeline Parallelism (PP) + +Model: meta-llama/Llama-3.1-8B-Instruct, TP=1, len=8192 + +| PP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err | +|:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:| +| 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | +| 2 | 7.51 | -0.42% | 1.10 | +336.36% | 0.07 | +114.29% | -0.85% | +| 4 | 4.26 | -12.22% | 1.05 | +357.14% | 0.07 | +114.29% | +1.59% | + +- **Activation drops with PP**: PP=1 → 1.89 GiB, PP=2 → 1.10 GiB, PP=4 → 1.05 GiB. The formula always predicts 4.80 GiB regardless of PP. 
+- **Weight error grows with PP**: layer imbalance across stages causes the formula (which assumes uniform distribution) to deviate at high PP. + +### Sensitivity: Context Length (max_model_len) + +| Model | max_model_len | Actual KV (GiB) | KV err | Actual tokens | Pred tokens | Token err | +|-------|:-------------:|:---------------:|:------:|:-------------:|:-----------:|:---------:| +| Llama-3.1-8B-Instruct | 2,048 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | +| Llama-3.1-8B-Instruct | 4,096 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | +| Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,000 | 459,509 | -3.46% | +| Llama-3.1-8B-Instruct | 16,384 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | +| Llama-3.1-8B-Instruct | 32,768 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | +| Qwen2.5-7B-Instruct | 2,048 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | +| Qwen2.5-7B-Instruct | 4,096 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | +| Qwen2.5-7B-Instruct | 8,192 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | +| Qwen2.5-7B-Instruct | 16,384 | 58.53 | -4.21% | 1,095,968 | 1,049,789 | -4.21% | +| Qwen2.5-7B-Instruct | 32,768 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | + +- **KV pool size (GiB) is independent of max_model_len**: both formula and vLLM agree. The pool is sized from available memory, not a pre-allocated token count. +- **Token count error is model-dependent but context-length-stable**: the per-token KV bytes formula has architecture-specific errors that are consistent across all context lengths. + +### Root Cause Analysis + +#### 1. Activation Constants Are Stale (Primary Error Source) + +The planner uses fixed constants per architecture calibrated against vLLM v0.16.0 or earlier. vLLM v0.17.0 reduced reported activation overhead by ~60% (see Part 2: Version Sensitivity), and this lower level persists through v0.19.0. 
+ +| Architecture | Planner constant (GiB) | Observed v0.19.0 range (GiB) | Error range | +|-------------|:---------------------:|:----------------------------:|:-----------:| +| DeepseekV2 | 8.00 | 1.93 | +314.51% | +| Gemma2 | 5.50 | 3.62–3.66 | +50.27% to +51.93% | +| Gemma3* | 5.50 | 3.89–3.99 | +37.84% to +41.39% | +| Gemma | 5.50 | 3.63 | +51.52% | +| GptOss | 8.00 | 2.87 | +178.75% | +| Granite | 5.50 | 0.75–0.85 | +547.06% to +633.33% | +| KimiVL* | 8.00 | 2.85–2.92 | +173.97% to +180.70% | +| Llama | 4.80 | 0.77–1.97 | +143.65% to +523.38% | +| Llama4* | 8.00 | 3.19 | +150.78% | +| LlavaNext* | 2.50 | 0.79 | +216.46% | +| Mistral3* | 2.50 | 2.03–2.18 | +14.68% to +23.15% | +| Mixtral | 8.00 | 1.21 | +561.16% | +| Phi3 | 5.50 | 1.52 | +261.84% | +| Phi | 5.50 | 0.79 | +596.20% | +| Qwen2 | 5.60 | 2.21–2.29 | +144.54% to +153.39% | +| Qwen3 | 5.60 | 2.21 | +153.39% | +| Qwen2Moe | 8.00 | 2.47 | +223.89% | +| Qwen3Moe | 8.00 | 2.68 | +198.51% | + +Re-calibrating these constants against v0.19.0 measurements is the highest-value fix. See Part 2 for the version history showing when constants became stale. + +#### 2. Non-torch Constants Under-estimated for Multi-GPU + +| TP | PP | Constant used (GiB) | Observed mean (GiB) | Mean error | +|:--:|:--:|:-------------------:|:-------------------:|:----------:| +| 1 | 1 | 0.15 | 0.27 | -42.17% | +| 1 | 2 | 0.15 | 0.07 | +114.29% | +| 1 | 4 | 0.15 | 0.07 | +114.29% | +| 2 | 1 | 0.60 | 2.08 | -71.15% | +| 4 | 1 | 0.60 | 2.17 | -72.34% | + +TP≥2 requires NCCL all-reduce buffers (~2.1 GiB/GPU vs the 0.60 GiB constant). At PP≥2 the error flips sign: observed non-torch overhead is only ~0.07 GiB/GPU, so the 0.15 GiB constant over-estimates it (+114%); the formula has no PP-specific term, including for P2P send/receive buffers. + +#### 3. GPU Catalog vs Physical Memory + +Planner uses 80 GiB (catalog); H100 physical VRAM is 79.19 GiB. +Effect: KV pool over-predicted by ~0.77 GiB (76.00 vs 75.23 GiB at 0.95 utilization). + +#### 4. CUDA Graph Memory + +Observed pool sizes: 0.51–1.85 GiB (mean 1.04 GiB).
vLLM allocates CUDA graphs after sizing the KV cache, so the reported KV pool already includes CUDA graph memory — no formula correction needed. + +--- + +## Part 2: vLLM Version Sensitivity + +**Goal**: Determine when the planner's activation constants became stale by measuring activation memory across vLLM releases. + +**Model**: `Qwen/Qwen3-14B` — tp=1, pp=1, dp=1, max_model_len=8192, dtype=auto, no quantization. +**Hardware**: H100-80GB, gpu_memory_utilization=0.95. + +### Results + +| vLLM version | Weight (GiB) | Activation (GiB) | Non-torch (GiB) | KV cache (GiB) | Max concurrency | +|:---:|:---:|:---:|:---:|:---:|:---:| +| v0.15.0 | 27.52 | 5.64 | 0.13 | 41.94 | 33.55 | +| v0.16.0 | 27.52 | 5.64 | 0.13 | 41.94 | 33.55 | +| **v0.17.0** | 27.52 | **2.23** | 0.13 | 45.34 | 36.27 | +| v0.18.0 | 27.52 | 2.23 | 0.25 | 45.23 | 36.18 | +| v0.19.0† | 27.52 | ~2.21 | 0.25 | ~45.2 | ~36.2 | + +†v0.19.0 values extrapolated from Qwen3-8B (same architecture, scaled); Qwen3-14B not directly measured at v0.19.0. + +**Key findings**: + +- **Activation dropped 60% between v0.16.0 and v0.17.0**: 5.64 GiB → 2.23 GiB. This is the point at which all planner constants became stale. +- **Weight memory is stable across all versions**: model parameters are version-independent. +- **KV cache increased ~3.4 GiB at v0.17.0+**: the freed activation overhead is reallocated to the KV pool, increasing max concurrency by ~8%. +- **Non-torch increased at v0.18.0**: 0.13 → 0.25 GiB; likely new runtime bookkeeping overhead. +- **The planner's Qwen3 constant (5.60 GiB) matches v0.16.0 exactly** — constants were last calibrated against v0.16.0 or earlier. + +### Implication for Constant Re-calibration + +The activation drop at v0.17.0 is not architecture-specific — it reflects a vLLM-wide change in how activation memory is measured or allocated. 
All architecture constants in the planner should be re-calibrated against a single vLLM version (recommend v0.19.0) and re-validated whenever vLLM is updated. + +| Metric | v0.16.0 | v0.17.0+ | Change | +|--------|:-------:|:--------:|:------:| +| Activation (Qwen3-14B) | 5.64 GiB | 2.23 GiB | −60% | +| KV cache (Qwen3-14B) | 41.94 GiB | ~45.3 GiB | +8% | +| Max concurrency (Qwen3-14B) | 33.55 | ~36.2 | +8% | + +--- + +## Part 3: Parameters Not Yet Modeled + +The following vLLM flags affect memory allocation but are not yet accepted as planner inputs. + +### `--kv-cache-dtype fp8` + +| Model | kv_cache_dtype | Actual KV (GiB) | KV GiB err | Actual tokens | Pred tokens | Token err | +|-------|:--------------:|:---------------:|:----------:|:-------------:|:-----------:|:---------:| +| Qwen2.5-7B-Instruct | auto | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | +| Qwen2.5-7B-Instruct | fp8 | 58.53 | -4.21% | 2,192,000 | 1,049,789 | -52.11% | +| Llama-3.1-8B-Instruct | auto | 58.11 | -3.47% | 476,000 | 459,509 | -3.46% | +| Llama-3.1-8B-Instruct | fp8 | 58.11 | -3.47% | 952,032 | 459,509 | -51.73% | + +**KV pool size (GiB) is unaffected** — fp8 halves per-token storage, not the pool. **Token count is ~2× too low** because the planner uses the model's native dtype (BF16 = 2 bytes/element) instead of fp8 (1 byte/element). Fix: accept `kv_cache_dtype`; when `fp8`, use 1 byte/element. + +### `--dtype` override + +| dtype | Actual weight (GiB) | Weight err | Actual KV (GiB) | KV err | +|-------|:-------------------:|:----------:|:---------------:|:------:| +| bfloat16 | 14.99 | -0.22% | 58.11 | -3.47% | +| float16 | 14.99 | -0.22% | 58.11 | -3.47% | +| float32 | 29.98 | -50.11% | 42.80 | +31.06% | + +`--dtype float32` doubles weight memory. The planner reads the HF config dtype (BF16) and has no visibility into the vLLM override → −50% weight error, +31% KV error. `--dtype float16` has the same 2-byte element width as the HF config dtype (BF16) → near-zero error. Fix: accept `dtype` as input.
+ +### Runtime `--quantization fp8` + +| Model | Actual weight (GiB) | Weight err | Actual KV (GiB) | KV err | +|-------|:-------------------:|:----------:|:---------------:|:------:| +| Llama-3.1-8B-Instruct | 8.49 | +76.18% | 64.61 | -13.18% | + +Runtime `--quantization fp8` compresses weights on-the-fly. vLLM logs the post-compression size (~half of BF16). The planner finds no `quantization_config` in the HF repo and predicts full BF16 weights → ~+76% weight error. Fix: accept `quantization fp8`; apply 1 byte/param. + +### Recommendations + +| Priority | Action | Expected impact | +|:--------:|--------|:---------------:| +| High | **Re-calibrate activation constants** from v0.19.0 measurements | Removes largest single error source (2–7× over-estimate) | +| High | **`kv_cache_dtype`** — when `fp8`, use 1 byte/element for KV | Fixes ~2× token/concurrency error for fp8-KV runs | +| Medium | **`dtype`** — when `float32`, double bytes-per-param | Fixes −50% weight error for float32 runs | +| Medium | **`quantization fp8` (runtime)** — apply 1 byte/param | Fixes +76% weight error for runtime-fp8 runs | +| Medium | **Re-measure non-torch constants for TP≥2 and PP≥2** | +1–2 GiB KV accuracy for multi-GPU | +| Medium | **Scale activation constant by 1/PP** | Fixes growing activation error at high PP | +| Medium | **`find_possible_tp`: require vocab_size divisibility** — valid TP must divide both `num_attention_heads` and `vocab_size` (vLLM shards the embedding/LM-head across TP ranks). Evidence: Qwen3-14B has 40 heads (tp=5 valid) but vocab_size=151936 (151936 % 5 ≠ 0 → rejected by vLLM). Fix: return divisors of `gcd(num_attention_heads, vocab_size)`.
| Prevents planner from suggesting TP values that vLLM will reject | +| Low | **Use physical GPU memory** (79.19 GiB) instead of catalog 80 GiB | +0.77 GiB KV accuracy | + +--- + +## Run Matrix — vLLM v0.19.0 / H100-80GB + +**57 successful runs, 7 failed runs.** + +Quantization abbreviations: `ct` = compressed-tensors, `gptq` = gptq_marlin, `fp8` = fp8 inline, `mxfp4` = mx-format fp4, `—` = none. + +Vision/multi-modal models in this sweep: `moonshotai/Kimi-VL-A3B-Instruct` (vision-language MoE), `ibm-granite/granite-vision-3.3-2b` (vision-language), `google/gemma-3-{4b,12b,27b}-it` (vision-language), `meta-llama/Llama-4-Scout-17B-16E-Instruct` (vision+text MoE). + +`Qwen/Qwen1.5-MoE-A2.7B` uses the Qwen2Moe architecture (14.3B total, 2.7B active). Its activation memory (2.47 GiB) is much lower than the generic MoE constant (8.0 GiB) used by the planner, similar to the pattern observed for Qwen3Moe, Mixtral, and Llama4. + +### Successful Runs + +| Model | TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight err | Activation err | Non-torch err | KV cache err | +|---|---|---|---|---|---|---|---|---|---|---|---| +| codellama/CodeLlama-7b-hf | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +523.4% | -40.0% | -5.1% | +| deepseek-ai/DeepSeek-V2-Lite-Chat | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.6% | +314.5% | -42.3% | -11.5% | +| google/gemma-2-27b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +50.3% | -42.3% | -4.6% | +| google/gemma-2-2b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.6% | +51.9% | -37.5% | -1.5% | +| google/gemma-2-9b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +50.7% | -40.0% | -1.8% | +| google/gemma-3-12b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -2.6% | +39.6% | -40.0% | -0.1% | +| google/gemma-3-27b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.7% | +37.8% | -42.3% | -1.4% | +| google/gemma-3-4b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -6.6% | +41.4% | -40.0% | -0.3% | +| google/gemma-7b | 1 | 1 | 1 | 8192 | bf16 | — | auto | +0.0% | 
+51.5% | -40.0% | -1.8% | +| ibm-granite/granite-3.1-2b-instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.4% | +633.3% | -67.4% | -5.3% | +| ibm-granite/granite-3.1-8b-instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +547.1% | -67.4% | -6.0% | +| ibm-granite/granite-3.3-8b-instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +547.1% | -67.4% | -6.0% | +| ibm-granite/granite-vision-3.3-2b | 1 | 1 | 1 | 8192 | bf16 | — | auto | +0.0% | +216.5% | -40.0% | -1.2% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | fp8 | -0.2% | +154.0% | -40.0% | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | bf16 | fp8 | auto | +76.2% | +154.0% | -40.0% | -13.2% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | f16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | f32 | — | auto | -50.1% | +117.2% | -40.0% | +31.1% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 2048 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 4096 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 2 | 1 | 8192 | bf16 | — | auto | -0.4% | +336.4% | +114.3% | -0.9% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 4 | 1 | 8192 | bf16 | — | auto | -12.2% | +357.1% | +114.3% | +1.6% | +| meta-llama/Llama-3.1-8B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.4% | +154.0% | -71.0% | +2.8% | +| meta-llama/Llama-3.1-8B-Instruct | 4 | 1 | 1 | 8192 | bf16 | — | auto | -0.8% | +154.0% | -71.8% | +4.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 16384 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 32768 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | +| microsoft/phi-2 | 1 | 1 | 1 | 2048 | f16 | — | auto | -0.2% | +596.2% | -37.5% | 
-5.5% | +| microsoft/phi-4 | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.3% | +261.8% | -40.0% | -6.6% | +| mistralai/Mistral-Small-3.1-24B-Instruct-2503 | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +23.2% | -40.0% | +1.5% | +| mistralai/Mixtral-8x7B-Instruct-v0.1 | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +561.2% | -71.0% | -1.9% | +| moonshotai/Kimi-Dev-72B | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +144.5% | -71.3% | +61.8% | +| moonshotai/Kimi-Dev-72B | 4 | 1 | 1 | 8192 | bf16 | — | auto | -0.5% | +144.5% | -72.9% | +9.3% | +| moonshotai/Kimi-VL-A3B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.6% | +174.0% | -40.0% | -9.8% | +| moonshotai/Kimi-VL-A3B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -1.6% | +180.7% | -71.0% | +2.4% | +| openai/gpt-oss-20b | 2 | 1 | 1 | 8192 | bf16 | mxfp4 | auto | -8.6% | +178.7% | -71.0% | +2.8% | +| meta-llama/Llama-4-Scout-17B-16E-Instruct | 4 | 1 | 1 | 8192 | bf16 | — | auto | -4.8% | +150.8% | -72.4% | +36.2% | +| Qwen/Qwen2.5-72B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +144.5% | -71.3% | +60.8% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | fp8 | -0.5% | +153.4% | -37.5% | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 16384 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 32768 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 2048 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 4096 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | +| Qwen/Qwen2.5-7B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.4% | +153.4% | -70.9% | +2.6% | +| Qwen/Qwen2.5-7B-Instruct | 4 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +153.4% | -71.8% | +4.6% | +| Qwen/Qwen3-30B-A3B | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +198.5% | -44.4% | -28.8% | +| 
Qwen/Qwen1.5-MoE-A2.7B | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +223.9% | -40.0% | -10.1% | +| Qwen/Qwen3-8B | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +153.4% | -40.0% | -4.4% | +| RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic | 2 | 1 | 1 | 8192 | bf16 | ct | auto | -0.1% | +144.9% | -71.4% | +5.0% | +| RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic | 4 | 1 | 1 | 8192 | bf16 | ct | auto | -0.2% | +143.7% | -72.9% | +5.9% | +| redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 | 2 | 1 | 1 | 8192 | bf16 | ct | auto | -0.1% | +144.9% | -71.6% | +5.0% | +| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 | 1 | 1 | 1 | 8192 | f16 | gptq | auto | -0.7% | +154.0% | -40.0% | -3.0% | +| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | 1 | 1 | 1 | 8192 | f16 | ct | auto | -0.4% | +154.0% | -40.0% | -3.1% | +| RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 | 1 | 1 | 1 | 8192 | bf16 | ct | auto | -0.2% | +14.7% | -42.3% | +1.2% | +| RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 | 2 | 1 | 1 | 8192 | bf16 | ct | auto | -0.8% | +23.2% | -71.0% | +5.3% | +| RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic | 1 | 1 | 1 | 8192 | bf16 | ct | auto | -0.4% | +153.4% | -37.5% | -3.9% | +| RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 | 1 | 1 | 1 | 8192 | bf16 | ct | auto | -0.4% | +153.4% | -40.0% | -3.9% | + +### Failed Runs + +| Model | TP | PP | DP | max_len | Notes | +|---|---|---|---|---|---| +| codellama/CodeLlama-34b-hf | 2 | 1 | 1 | 8192 | GPU contention at runtime | +| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 2 | 8192 | DP=2 | +| microsoft/phi-2 | 1 | 1 | 1 | 8192 | max_model_len=8192 > max_position_embeddings=2048; fixed with max_model_len=2048 | +| moonshotai/Kimi-Dev-72B | 4 | 1 | 1 | 8192 | second attempt; tp=2 succeeded | +| openai/gpt-oss-20b | 1 | 1 | 1 | 8192 | sampler warmup OOM (~786 MiB needed, <552 MiB free) | +| openai/gpt-oss-20b | 2 | 1 | 1 | 8192 | sampler warmup OOM at gmu=0.95; succeeded at gmu=0.90 | +| Qwen/Qwen3-14B | 5 | 1 | 1 | 
8192 | tp=5 invalid (vocab not divisible by 5) | + +### Calibration Decisions + +_Document constant changes here: old value → new value, evidence._ + +--- + +## Appendix: Measurement Methodology + +### Log Extraction + +Metrics are extracted from vLLM startup logs by [`accuracy/scripts/parse_logs.py`](scripts/parse_logs.py). All patterns match the first occurrence of the line, which comes from the rank-0 worker (TP0 or the sole GPU). Each regex below is taken directly from `parse_logs.py`. + +#### Model / run config + +``` +Initializing a V1 LLM engine (v0.19.0) with config: model='...', dtype=..., + max_seq_len=..., tensor_parallel_size=..., pipeline_parallel_size=..., + data_parallel_size=..., quantization=..., kv_cache_dtype=... +``` + +Fields extracted: `model`, `dtype`, `max_model_len`, `tp`, `pp`, `dp`, `quantization`, `kv_cache_dtype`. + +#### Weight memory per GPU (`weight_memory_gib`) + +``` +Model loading took 14.99 GiB memory and 14.429680 seconds +``` + +Regex: `Model loading took ([0-9.]+) GiB memory` + +This is the per-GPU weight footprint as loaded by vLLM's model runner. + +#### Activation, non-torch, and total non-KV memory + +``` +Memory profiling takes 17.96 seconds. Total non KV cache memory: 17.12GiB; + torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; + weights memory: 14.99GiB. +``` + +Regex: `Memory profiling takes [0-9.]+ seconds\. 
Total non KV cache memory: ([0-9.]+)GiB; torch peak memory increase: ([0-9.]+)GiB; non-torch forward increase memory: ([0-9.]+)GiB; weights memory: ([0-9.]+)GiB\.` + +Fields extracted: + +| CSV column | Log field | Description | +|---|---|---| +| `total_non_kv_cache_gib` | `Total non KV cache memory` | Sum of all non-KV usage per GPU | +| `activation_memory_gib` | `torch peak memory increase` | Peak PyTorch memory from warmup/CUDA graph profiling | +| `non_torch_forward_gib` | `non-torch forward increase memory` | CUDA runtime + NCCL overhead | +| `weights_memory_gib` | `weights memory` | Cross-check of model loading line; should match `weight_memory_gib` | + +#### KV cache memory per GPU (`kv_cache_memory_gib`) + +``` +Available KV cache memory: 58.11 GiB +``` + +Regex: `Available KV cache memory: ([0-9.]+) GiB` + +#### KV cache token count (`kv_cache_tokens`) + +``` +GPU KV cache size: 476,016 tokens +``` + +Regex: `GPU KV cache size: ([\d,]+) tokens` + +#### Max concurrency (`max_concurrency`) + +``` +Maximum concurrency for 8,192 tokens per request: 58.11x +``` + +Regex: `Maximum concurrency for ([\d,]+) tokens per request: ([0-9.]+)x` + +This is `kv_cache_tokens / max_model_len`. + +#### CUDA graph memory (`cuda_graph_actual_gib`) + +``` +CUDA graph pool memory: 0.91 GiB (actual), 0.84 GiB (estimated), difference: 0.07 GiB (7.7%). +``` + +Regex: `CUDA graph pool memory: ([0-9.]+) GiB \(actual\), ([0-9.]+) GiB \(estimated\)` + +#### KV cache blocks (`kv_cache_blocks`) + +``` +Engine 000: vllm cache_config_info with initialization after num_gpu_blocks is: 29751 +``` + +Regex: `num_gpu_blocks is: (\d+)` + +--- + +### Capacity Planner Functions Evaluated + +Predictions are generated by [`accuracy/scripts/predict_capacity.py`](scripts/predict_capacity.py), which imports from [`src/planner/capacity_planner.py`](../../src/planner/capacity_planner.py). 
+ +| Metric | Log column | Planner function | File:line | +|---|---|---|---| +| Weight memory per GPU | `weight_memory_gib` | `per_gpu_model_memory_required()` → `model_memory_req()` | [`capacity_planner.py:832`](../../src/planner/capacity_planner.py#L832) | +| Activation memory | `activation_memory_gib` | `estimate_vllm_activation_memory()` | [`capacity_planner.py:380`](../../src/planner/capacity_planner.py#L380) | +| Non-torch memory | `non_torch_forward_gib` | `estimate_vllm_non_torch_memory()` | [`capacity_planner.py:351`](../../src/planner/capacity_planner.py#L351) | +| KV cache memory per GPU | `kv_cache_memory_gib` | `allocatable_kv_cache_memory()` ÷ (tp×pp×dp) | [`capacity_planner.py:855`](../../src/planner/capacity_planner.py#L855) | +| KV cache tokens | `kv_cache_tokens` | KV bytes ÷ `KVCacheDetail.per_token_memory_bytes` per GPU | [`capacity_planner.py:81`](../../src/planner/capacity_planner.py#L81) | +| Max concurrency | `max_concurrency` | KV bytes ÷ (per-token bytes × max_model_len) | [`capacity_planner.py:81`](../../src/planner/capacity_planner.py#L81) | +| CUDA graph memory | `cuda_graph_actual_gib` | `estimate_vllm_cuda_graph_memory()` → returns 0.0 | [`capacity_planner.py:366`](../../src/planner/capacity_planner.py#L366) | + +**Weight memory** is computed by parsing the model's safetensors index to count parameter bytes per dtype. For quantized models, the quant method is detected from `quantization_config` in the HF config and the appropriate bytes-per-param are applied. See `model_memory_req()` at [`capacity_planner.py:553`](../../src/planner/capacity_planner.py#L553). + +**Activation memory** uses a tiered lookup: first checks `VALIDATED_ACTIVATION_PROFILES` (a dict keyed by `architectures[0]`), then falls back to `ACTIVATION_MEMORY_BASE_MOE_GIB` (8.0 GiB) for MoE models, `ACTIVATION_MEMORY_BASE_MULTIMODAL_GIB` (2.5 GiB) for multimodal, and `ACTIVATION_MEMORY_BASE_DENSE_GIB` (5.5 GiB) for all others. 
See [`capacity_planner.py:44`](../../src/planner/capacity_planner.py#L44) for the constants and [`capacity_planner.py:424`](../../src/planner/capacity_planner.py#L424) for the lookup logic. + +**KV cache memory** uses the formula: +``` +available = gpu_memory_gib × gpu_util × (tp × pp × dp) + − model_weights × dp + − activation × dp + − cuda_graph (0.0) + − non_torch +per_gpu_kv = available / (tp × pp × dp) +``` +This uses the catalog GPU memory (80 GiB), not the physical VRAM (79.19 GiB); at `gpu_util` = 0.95 the gap inflates the predicted per-GPU budget by ~0.77 GiB (76.00 vs 75.23 GiB), so it pushes KV toward over-estimation and does not explain the baseline KV under-estimate, which is driven by the over-estimated activation constants. + +**Per-token KV bytes** are derived from `KVCacheDetail` at [`capacity_planner.py:81`](../../src/planner/capacity_planner.py#L81): +``` +per_token_bytes = 2 × num_kv_heads × head_dim × num_layers × dtype_bytes +``` +For MLA (DeepSeek models), the formula uses `kv_lora_rank` and `qk_rope_head_dim` instead. TP sharding: each GPU holds `per_token_bytes / (tp × pp)` per token. diff --git a/accuracy/results/v0.19.0/accuracy_report.md b/accuracy/results/v0.19.0/accuracy_report.md deleted file mode 100644 index fa010780..00000000 --- a/accuracy/results/v0.19.0/accuracy_report.md +++ /dev/null @@ -1,200 +0,0 @@ -# Capacity Planner Accuracy Report — vLLM v0.19.0 / H100-80GB - -**Hardware**: H100-80GB (catalog 80 GiB, physical 79.19 GiB) -**Planner inputs evaluated**: model, tp, pp, dp, max_model_len, gpu_memory_utilization - -> Percent error = (predicted − actual) / actual × 100. Positive = over-estimate, negative = under-estimate. - -## Part 1: Accuracy Evaluation - -Covers 51 runs across 32 models using only parameters the planner currently accepts as inputs. Excludes runs with `--dtype float32`, runtime `--quantization fp8`, and `--kv-cache-dtype fp8` (see Part 2).
- -### Summary - -| Metric | Mean error | Mean abs error | n | -|--------|:----------:|:--------------:|:-:| -| KV cache memory (all runs) | +0.34% | +6.62% | 51 | -| KV cache memory (baseline: tp=pp=dp=1, len=8192, no quant) | -5.12% | — | 20 | -| Weight memory | -0.89% | +0.89% | 51 | -| Activation memory | +195.12% | +195.12% | 51 | -| Non-torch overhead | -44.08% | — | 51 | -| Max concurrency | +3.34% | +15.34% | 51 | - -**Key findings**: - -- **Weight memory is accurate**: mean abs error +0.89%, computed directly from safetensors parameter counts. -- **KV cache memory is close**: +0.34% mean error across all runs; -5.12% at baseline. Errors are small and consistent. -- **Activation is the dominant error source**: mean +195.12% (over-estimate). The planner uses empirical constants measured against an older vLLM version; v0.19.0 reports substantially lower values. See Root Cause Analysis. -- **Max concurrency tracks KV accuracy**: +3.34% mean error; deviations come from the per-token KV formula, not the pool size prediction. 
- -### Per-Model Results — Baseline (TP=1, PP=1, DP=1, len=8192, no quantization) - -| Model | Arch | Weight err | Activation err | Non-torch err | KV cache err | Max conc err | -|-------|------|:----------:|:--------------:|:-------------:|:------------:|:------------:| -| Qwen2.5-7B-Instruct | Qwen2 | -0.45% | +153.39% | -37.50% | -4.21% | -4.22% | -| Qwen3-30B-A3B | Qwen3Moe | -0.02% | +198.51% | -44.44% | -28.75% | -28.72% | -| Qwen3-8B | Qwen3 | -0.09% | +153.39% | -40.00% | -4.36% | -4.36% | -| CodeLlama-7b-hf | Llama | -0.07% | +523.38% | -40.00% | -5.13% | -5.13% | -| DeepSeek-V2-Lite-Chat | DeepseekV2 | -0.59% | +314.51% | -42.31% | -11.50% | -11.50% | -| gemma-2-27b-it | Gemma2 | -0.01% | +50.27% | -42.31% | -4.64% | -4.61% | -| gemma-2-2b-it | Gemma2 | -0.62% | +51.93% | -37.50% | -1.49% | -1.39% | -| gemma-2-9b-it | Gemma2 | -0.03% | +50.68% | -40.00% | -1.82% | -1.75% | -| gemma-3-12b-it | Gemma3* | -2.61% | +39.59% | -40.00% | -0.15% | +0.00% | -| gemma-3-27b-it | Gemma3* | -0.69% | +37.84% | -42.31% | -1.42% | +11.43% | -| gemma-3-4b-it | Gemma3* | -6.65% | +41.39% | -40.00% | -0.27% | +2.84% | -| gemma-7b | Gemma | -0.05% | +51.52% | -40.00% | -1.79% | -1.77% | -| granite-3.1-2b-instruct | Granite | -0.44% | +633.33% | -67.39% | -5.27% | -5.27% | -| granite-3.1-8b-instruct | Granite | -0.20% | +547.06% | -67.39% | -6.02% | -6.03% | -| granite-3.3-8b-instruct | Granite | -0.20% | +547.06% | -67.39% | -6.02% | -6.03% | -| granite-vision-3.3-2b | LlavaNext* | +0.04% | +216.46% | -40.00% | -1.23% | -1.23% | -| Llama-3.1-8B-Instruct | Llama | -0.22% | +153.97% | -40.00% | -3.47% | -3.48% | -| phi-4 | Phi3 | -0.31% | +261.84% | -40.00% | -6.59% | -6.58% | -| Mistral-Small-3.1-24B-Instruct-2503 | Mistral3* | -0.08% | +23.15% | -40.00% | +1.54% | +1.55% | -| Kimi-VL-A3B-Instruct | KimiVL* | -0.58% | +173.97% | -40.00% | -9.76% | -87.31% | - -### Sensitivity: Tensor Parallelism (TP) - -| Model | TP | Actual weight (GiB) | Weight err | Actual activ (GiB) | 
Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err | -|-------|:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:| -| Llama-3.1-8B-Instruct | 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | -| Llama-3.1-8B-Instruct | 2 | 7.51 | -0.42% | 1.89 | +153.97% | 2.07 | -71.01% | +2.76% | -| Llama-3.1-8B-Instruct | 4 | 3.77 | -0.81% | 1.89 | +153.97% | 2.13 | -71.83% | +4.48% | -| Qwen2.5-7B-Instruct | 1 | 14.25 | -0.45% | 2.21 | +153.39% | 0.24 | -37.50% | -4.21% | -| Qwen2.5-7B-Instruct | 2 | 7.12 | -0.38% | 2.21 | +153.39% | 2.06 | -70.87% | +2.61% | -| Qwen2.5-7B-Instruct | 4 | 3.55 | -0.10% | 2.21 | +153.39% | 2.13 | -71.83% | +4.62% | - -- **Weights scale correctly** with TP: error stays near 0% across TP=1–4. -- **Activation is TP-invariant** in both formula and vLLM: error stays flat. -- **Non-torch is under-estimated at TP≥2**: NCCL all-reduce buffers push actual to ~2.1 GiB/GPU but the constant is 0.60 GiB. The opposing over-estimate in activation partially masks this in the KV cache error. - -### Sensitivity: Pipeline Parallelism (PP) - -Model: meta-llama/Llama-3.1-8B-Instruct, TP=1, len=8192 - -| PP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err | -|:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:| -| 1 | 14.99 | -0.22% | 1.89 | +153.97% | 0.25 | -40.00% | -3.47% | -| 2 | 7.51 | -0.42% | 1.10 | +336.36% | 0.07 | +114.29% | -0.85% | -| 4 | 4.26 | -12.22% | 1.05 | +357.14% | 0.07 | +114.29% | +1.59% | - -- **Activation drops with PP**: PP=1 → 1.89 GiB, PP=2 → 1.10 GiB, PP=4 → 1.05 GiB. The formula always predicts 4.80 GiB regardless of PP. -- **Weight error grows with PP**: layer imbalance across stages causes the formula (which assumes uniform distribution) to deviate at high PP. 
- -### Sensitivity: Context Length (max_model_len) - -| Model | max_model_len | Actual KV (GiB) | KV err | Actual tokens | Pred tokens | Token err | -|-------|:-------------:|:---------------:|:------:|:-------------:|:-----------:|:---------:| -| Llama-3.1-8B-Instruct | 2,048 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -| Llama-3.1-8B-Instruct | 4,096 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -| Llama-3.1-8B-Instruct | 8,192 | 58.11 | -3.47% | 476,000 | 459,509 | -3.46% | -| Llama-3.1-8B-Instruct | 16,384 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -| Llama-3.1-8B-Instruct | 32,768 | 58.11 | -3.47% | 476,016 | 459,509 | -3.47% | -| Qwen2.5-7B-Instruct | 2,048 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -| Qwen2.5-7B-Instruct | 4,096 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -| Qwen2.5-7B-Instruct | 8,192 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -| Qwen2.5-7B-Instruct | 16,384 | 58.53 | -4.21% | 1,095,968 | 1,049,789 | -4.21% | -| Qwen2.5-7B-Instruct | 32,768 | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | - -- **KV pool size (GiB) is independent of max_model_len**: both formula and vLLM agree. The pool is sized from available memory, not from a pre-allocated token count. -- **Token count predictions vary**: the per-token KV bytes formula has model-dependent errors that show up consistently across all context lengths. - -### Root Cause Analysis - -#### 1. Activation Constants Are Stale - -The planner uses fixed constants per architecture (e.g., 4.8 GiB for Llama) calibrated against an older vLLM version. 
vLLM v0.19.0 reports substantially lower values: - -| Architecture | Planner constant (GiB) | Observed v0.19.0 range (GiB) | Error range | -|-------------|:---------------------:|:----------------------------:|:-----------:| -| DeepseekV2 | 8.00 | 1.93–1.93 | +314.51% to +314.51% | -| Gemma2 | 5.50 | 3.62–3.66 | +50.27% to +51.93% | -| Gemma3* | 5.50 | 3.89–3.99 | +37.84% to +41.39% | -| Gemma | 5.50 | 3.63–3.63 | +51.52% to +51.52% | -| GptOss | 8.00 | 2.87–2.87 | +178.75% to +178.75% | -| Granite | 5.50 | 0.75–0.85 | +547.06% to +633.33% | -| KimiVL* | 8.00 | 2.85–2.92 | +173.97% to +180.70% | -| Llama | 4.80 | 0.77–1.97 | +143.65% to +523.38% | -| LlavaNext* | 2.50 | 0.79–0.79 | +216.46% to +216.46% | -| Mistral3* | 2.50 | 2.03–2.18 | +14.68% to +23.15% | -| Mixtral | 8.00 | 1.21–1.21 | +561.16% to +561.16% | -| Phi3 | 5.50 | 1.52–1.52 | +261.84% to +261.84% | -| Phi | 5.50 | 0.79–0.79 | +596.20% to +596.20% | -| Qwen2 | 5.60 | 2.21–2.29 | +144.54% to +153.39% | -| Qwen3 | 5.60 | 2.21–2.21 | +153.39% to +153.39% | -| Qwen3Moe | 8.00 | 2.68–2.68 | +198.51% to +198.51% | - -Re-calibrating these constants from the v0.19.0 measurements is the highest-value fix. - -#### 2. Non-torch Constants Under-estimated for Multi-GPU - -| TP | PP | Constant used (GiB) | Observed mean (GiB) | Mean error | -|:--:|:--:|:-------------------:|:-------------------:|:----------:| -| 1 | 1 | 0.15 | 0.27 | -42.17% | -| 1 | 2 | 0.15 | 0.07 | +114.29% | -| 1 | 4 | 0.15 | 0.07 | +114.29% | -| 2 | 1 | 0.6 | 2.08 | -71.15% | -| 4 | 1 | 0.6 | 2.17 | -72.34% | - -TP≥2 requires NCCL all-reduce buffers (~2.1 GiB/GPU vs the 0.60 GiB constant). PP≥2 adds P2P send/receive buffers that the formula ignores entirely. - -#### 3. GPU Catalog vs Physical Memory - -Planner uses 80 GiB (catalog); H100 physical VRAM is 79.19 GiB. -Effect: KV pool over-predicted by ~0.77 GiB (76.00 vs 75.23 GiB at 0.95 utilization). - -#### 4. CUDA Graph Memory - -Observed pool sizes: 0.51–1.85 GiB (mean 1.04 GiB). 
vLLM allocates CUDA graphs after sizing the KV cache, so the reported KV pool already includes CUDA graph memory — no formula correction needed. - ---- - -## Part 2: Next Steps — Parameters Not Yet Modeled - -The following vLLM flags affect memory allocation but are not yet accepted as planner inputs. Each subsection quantifies the prediction gap to inform which inputs to add next. - -### `--kv-cache-dtype fp8` - -| Model | kv_cache_dtype | Actual KV (GiB) | KV GiB err | Actual tokens | Pred tokens | Token err | -|-------|:--------------:|:---------------:|:----------:|:-------------:|:-----------:|:---------:| -| Qwen2.5-7B-Instruct | auto | 58.53 | -4.21% | 1,096,000 | 1,049,789 | -4.22% | -| Qwen2.5-7B-Instruct | fp8 | 58.53 | -4.21% | 2,192,000 | 1,049,789 | -52.11% | -|||||||| -| Llama-3.1-8B-Instruct | auto | 42.80 | +31.06% | 175,296 | 459,509 | +162.13% | -| Llama-3.1-8B-Instruct | fp8 | 58.11 | -3.47% | 952,032 | 459,509 | -51.73% | -|||||||| - -**KV pool size (GiB) is unaffected** — fp8 halves per-token storage, not the pool. The planner's GiB prediction stays accurate. **Token count is ~2× too low** because the planner always uses the model's native dtype (BF16 = 2 bytes/element) instead of fp8 (1 byte/element). Fix: accept `kv_cache_dtype` as input; when `fp8`, use 1 byte/token. - -### `--dtype` override - -| dtype | Actual weight (GiB) | Weight err | Actual KV (GiB) | KV err | -|-------|:-------------------:|:----------:|:---------------:|:------:| -| bfloat16 | 14.99 | -0.22% | 58.11 | -3.47% | -| bfloat16 | 14.99 | -0.22% | 58.11 | -3.47% | -| float16 | 14.99 | -0.22% | 58.11 | -3.47% | -| float32 | 29.98 | -50.11% | 42.80 | +31.06% | - -**`--dtype float32`** doubles weight memory. The planner reads the HF config dtype (BF16) and has no visibility into the vLLM override → −50% weight error, +31% KV error. -**`--dtype float16`** matches the HF config for these models → near-zero error. 
-Fix: accept `dtype` as input and use it to override the bytes-per-param calculation. - -### Runtime `--quantization fp8` - -| Model | Actual weight (GiB) | Weight err | Actual KV (GiB) | KV err | -|-------|:-------------------:|:----------:|:---------------:|:------:| -| Llama-3.1-8B-Instruct | 8.49 | +76.18% | 64.61 | -13.18% | - -Runtime `--quantization fp8` compresses weights on-the-fly after loading. vLLM logs the post-compression size (~half of BF16). The planner finds no `quantization_config` in the HF repo and predicts the full BF16 weight → ~+76% weight error. -Fix: accept `quantization fp8` as input; apply 1 byte/param for weight estimation. - -### Recommendations - -| Priority | Input to add | Expected impact | -|:--------:|-------------|:---------------:| -| High | **Re-calibrate activation constants** from v0.19.0 measurements. Current constants are 2–7× too high. | Removes largest single error source | -| High | **`kv_cache_dtype`** — when `fp8`, use 1 byte/token for KV. | Fixes ~2× token/concurrency error for fp8-KV runs | -| Medium | **`dtype`** — when `float32`, double bytes-per-param. | Fixes −50% weight error for float32 runs | -| Medium | **`quantization fp8` (runtime)** — apply 1 byte/param. | Fixes +76% weight error for runtime-fp8 runs | -| Medium | **Re-measure non-torch constants for TP≥2 and PP≥2.** | +1–2 GiB KV accuracy for multi-GPU | -| Medium | **Scale activation constant by 1/PP.** | Fixes growing activation error at high PP | -| Low | **Use physical GPU memory** (79.19 GiB) instead of catalog 80 GiB. | +0.77 GiB KV accuracy | \ No newline at end of file diff --git a/accuracy/results/v0.19.0/results_predicted.csv b/accuracy/results/v0.19.0/results_predicted.csv index 2b0da6b8..892010ad 100644 --- a/accuracy/results/v0.19.0/results_predicted.csv +++ b/accuracy/results/v0.19.0/results_predicted.csv @@ -15,12 +15,14 @@ meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,8192,torch.float16,None,auto,0. 
meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,8192,torch.float32,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,131072.0,14.9575,4.8,0.15,0.0,19.9075,56.0925,459509,56.09,14.9575,56.0925 meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,2048,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,131072.0,14.9575,4.8,0.15,0.0,19.9075,56.0925,459509,224.37,14.9575,56.0925 meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,4096,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,131072.0,14.9575,4.8,0.15,0.0,19.9075,56.0925,459509,112.18,14.9575,56.0925 +meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,131072.0,14.9575,4.8,0.15,0.0,19.9075,56.0925,459509,56.09,14.9575,56.0925 meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,2,1,8192,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,65536.0,7.4788,4.8,0.15,0.0,12.4288,65.9712,1080872,131.94,14.9575,131.9425 meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,4,1,8192,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,32768.0,3.7394,4.8,0.15,0.0,8.6894,70.9106,2323599,283.64,14.9575,283.6425 meta-llama/Llama-3.1-8B-Instruct,H100-80GB,2,1,1,8192,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,65536.0,7.4788,4.8,0.6,0.0,12.8788,65.5212,1073499,131.04,14.9575,131.0425 meta-llama/Llama-3.1-8B-Instruct,H100-80GB,4,1,1,8192,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,32768.0,3.7394,4.8,0.6,0.0,9.1394,70.4606,2308853,281.84,14.9575,281.8425 meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,16384,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,131072.0,14.9575,4.8,0.15,0.0,19.9075,56.0925,459509,28.05,14.9575,56.0925 
meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,32768,torch.bfloat16,None,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,131072.0,14.9575,4.8,0.15,0.0,19.9075,56.0925,459509,14.02,14.9575,56.0925 +meta-llama/Llama-4-Scout-17B-16E-Instruct,H100-80GB,4,1,1,8192,torch.bfloat16,None,auto,0.95,Llama4ForConditionalGeneration,Grouped-query attention,48,8,128,2,196608,49152.0,50.5903,8.0,0.6,0.0,59.1903,22.8097,498285,60.83,202.3611,91.2389 microsoft/phi-2,H100-80GB,1,1,1,2048,torch.float16,None,auto,0.95,PhiForCausalLM,Multi-head attention,32,32,80,2,327680,327680.0,5.1776,5.5,0.15,0.0,10.8276,65.1724,213557,104.28,5.1776,65.1724 microsoft/phi-4,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Phi3ForCausalLM,Grouped-query attention,40,10,128,2,204800,204800.0,27.3055,5.5,0.15,0.0,32.9555,43.0445,225677,27.55,27.3055,43.0445 mistralai/Mistral-Small-3.1-24B-Instruct-2503,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Mistral3ForConditionalGeneration,Grouped-query attention,40,8,128,2,163840,163840.0,44.7246,2.5,0.15,0.0,47.3746,28.6254,187599,22.9,44.7246,28.6254 @@ -30,27 +32,42 @@ moonshotai/Kimi-Dev-72B,H100-80GB,4,1,1,8192,torch.bfloat16,None,auto,0.95,Qwen2 moonshotai/Kimi-VL-A3B-Instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,KimiVLForConditionalGeneration,Multi-head attention,27,16,128,2,221184,221184.0,30.5616,8.0,0.15,0.0,38.7116,37.2884,181017,22.1,30.5616,37.2884 moonshotai/Kimi-VL-A3B-Instruct,H100-80GB,2,1,1,8192,torch.bfloat16,None,auto,0.95,KimiVLForConditionalGeneration,Multi-head attention,27,16,128,2,221184,110592.0,15.2808,8.0,0.6,0.0,23.8808,56.1192,544863,66.51,30.5616,112.2384 openai/gpt-oss-20b,H100-80GB,2,1,1,8192,torch.bfloat16,mxfp4,auto,0.9,GptOssForCausalLM,Grouped-query attention,24,8,64,0.53125,13056,6528.0,6.4081,8.0,0.6,0.0,15.0081,60.9919,10032102,1224.62,12.8162,121.9838 +Qwen/Qwen1.5-MoE-A2.7B,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Qwen2MoeForCausalLM,Multi-head 
attention,24,16,128,2,196608,196608.0,26.6652,8.0,0.15,0.0,34.8152,41.1848,224923,27.46,26.6652,41.1848 Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,fp8,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,57344.0,14.1852,5.6,0.15,0.0,19.9352,56.0648,1049789,128.15,14.1852,56.0648 Qwen/Qwen2.5-72B-Instruct,H100-80GB,2,1,1,8192,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,80,8,128,2,327680,163840.0,67.7129,5.6,0.6,0.0,73.9129,4.8871,32027,3.91,135.4259,9.7741 Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,57344.0,14.1852,5.6,0.15,0.0,19.9352,56.0648,1049789,128.15,14.1852,56.0648 -Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,16384,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,57344.0,14.1852,5.6,0.15,0.0,19.9352,56.0648,1049789,64.07,14.1852,56.0648 -Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,32768,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,57344.0,14.1852,5.6,0.15,0.0,19.9352,56.0648,1049789,32.04,14.1852,56.0648 -Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,2048,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,57344.0,14.1852,5.6,0.15,0.0,19.9352,56.0648,1049789,512.59,14.1852,56.0648 -Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,4096,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,57344.0,14.1852,5.6,0.15,0.0,19.9352,56.0648,1049789,256.3,14.1852,56.0648 -Qwen/Qwen2.5-7B-Instruct,H100-80GB,2,1,1,8192,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,28672.0,7.0926,5.6,0.6,0.0,13.2926,65.5074,2453196,299.46,14.1852,131.0148 -Qwen/Qwen2.5-7B-Instruct,H100-80GB,4,1,1,8192,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,14336.0,3.5463,5.6,0.6,0.0,9.7463,70.4537,5276861,644.15,14.1852,281.8148 
-Qwen/Qwen3-30B-A3B,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Qwen3MoeForCausalLM,Grouped-query attention,48,4,128,2,98304,98304.0,56.8705,8.0,0.15,0.0,65.0205,10.9795,119925,14.64,56.8705,10.9795 -Qwen/Qwen3-8B,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Qwen3ForCausalLM,Grouped-query attention,36,8,128,2,147456,147456.0,15.2564,5.6,0.15,0.0,21.0064,54.9936,400450,48.88,15.2564,54.9936 -redhatai/Llama-3.3-70B-Instruct-quantized.w8a8,H100-80GB,2,1,1,8192,torch.bfloat16,compressed-tensors,auto,0.95,LlamaForCausalLM,Grouped-query attention,80,8,128,2,327680,163840.0,33.8395,4.8,0.6,0.0,39.2395,39.1605,256642,31.33,67.679,78.321 -RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16,H100-80GB,1,1,1,8192,torch.float16,gptq_marlin,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,131072.0,5.3417,4.8,0.15,0.0,10.2917,65.7083,538282,65.71,5.3417,65.7083 -RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8,H100-80GB,1,1,1,8192,torch.bfloat16,compressed-tensors,auto,0.95,Mistral3ForConditionalGeneration,Grouped-query attention,40,8,128,2,163840,163840.0,24.0276,2.5,0.15,0.0,26.6776,49.3224,323239,39.46,24.0276,49.3224 -RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8,H100-80GB,2,1,1,8192,torch.bfloat16,compressed-tensors,auto,0.95,Mistral3ForConditionalGeneration,Grouped-query attention,40,8,128,2,163840,81920.0,12.0138,2.5,0.6,0.0,15.1138,62.1362,814431,99.42,24.0276,124.2724 -RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8,H100-80GB,1,1,1,8192,torch.float16,compressed-tensors,auto,0.95,LlamaForCausalLM,Grouped-query attention,32,8,128,2,131072,131072.0,8.4601,4.8,0.15,0.0,13.4101,62.5899,512736,62.59,8.4601,62.5899 -RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8,H100-80GB,1,1,1,8192,torch.bfloat16,compressed-tensors,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,57344.0,8.1106,5.6,0.15,0.0,13.8606,62.1394,1163533,142.03,8.1106,62.1394 
-google/gemma-2-27b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Gemma2ForCausalLM,Grouped-query attention,46,16,128,2,376832,376832.0,50.7145,5.5,0.15,0.0,56.3645,19.6355,55949,6.83,50.7145,19.6355 -google/gemma-2-2b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Gemma2ForCausalLM,Grouped-query attention,26,4,256,2,106496,106496.0,4.8696,5.5,0.15,0.0,10.5196,65.4804,660203,80.59,4.8696,65.4804 -google/gemma-2-9b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Gemma2ForCausalLM,Grouped-query attention,42,8,256,2,344064,344064.0,17.214,5.5,0.15,0.0,22.864,53.136,165824,20.24,17.214,53.136 -google/gemma-3-12b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Gemma3ForConditionalGeneration,Grouped-query attention,48,8,256,2,393216,393216.0,22.7007,5.5,0.15,0.0,28.3507,47.6493,130114,15.88,22.7007,47.6493 -google/gemma-3-27b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Gemma3ForConditionalGeneration,Grouped-query attention,62,16,128,2,507904,507904.0,51.0968,5.5,0.15,0.0,56.7468,19.2532,40702,4.97,51.0968,19.2532 -google/gemma-3-4b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,Gemma3ForConditionalGeneration,Grouped-query attention,34,4,256,2,139264,139264.0,8.0095,5.5,0.15,0.0,13.6595,62.3405,480652,58.67,8.0095,62.3405 -google/gemma-7b,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,GemmaForCausalLM,Multi-head attention,28,16,256,2,458752,458752.0,15.9027,5.5,0.15,0.0,21.5527,54.4473,127437,15.56,15.9027,54.4473 +Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,16384,torch.bfloat16,None,auto,0.95,Qwen2ForCausalLM,Grouped-query attention,28,4,128,2,57344,57344.0,14.1852,5.6,0.15,0.0,19.9352,56.0648,1049789,64.07,14.1852, ✓ RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 tp=1 pp=1 len=8192 + weight=8.1106 GiB activ=5.6 GiB kv=62.1394 GiB + +Wrote 52 rows → /Users/jchen/go/src/llm-d/llm-d-planner/accuracy/results/v0.19.0/results_predicted.csv + +Failed (7): + google/gemma-2-27b-it tp=1 pp=1 len=8192: 403 Client Error. 
(Request ID: Root=1-69e97239-3207860237ea9eac1725278a;e40963de-31e8-4b64-8665-c644ef789e0a) + +Cannot access gated repo for url https://huggingface.co/google/gemma-2-27b-it/resolve/main/model.safetensors. +Access to model google/gemma-2-27b-it is restricted and you are not in the authorized list. Visit https://huggingface.co/google/gemma-2-27b-it to ask for access. + google/gemma-2-2b-it tp=1 pp=1 len=8192: 403 Client Error. (Request ID: Root=1-69e97239-1a7c512774f241ae6b85c193;b461f77c-56f0-4a1a-967b-4a98d3625953) + +Cannot access gated repo for url https://huggingface.co/google/gemma-2-2b-it/resolve/main/model.safetensors. +Access to model google/gemma-2-2b-it is restricted and you are not in the authorized list. Visit https://huggingface.co/google/gemma-2-2b-it to ask for access. + google/gemma-2-9b-it tp=1 pp=1 len=8192: 403 Client Error. (Request ID: Root=1-69e97239-16466b334103d30130e77f43;54176022-86d0-42cc-beea-4ebeab18ca1f) + +Cannot access gated repo for url https://huggingface.co/google/gemma-2-9b-it/resolve/main/model.safetensors. +Access to model google/gemma-2-9b-it is restricted and you are not in the authorized list. Visit https://huggingface.co/google/gemma-2-9b-it to ask for access. + google/gemma-3-12b-it tp=1 pp=1 len=8192: 403 Client Error. (Request ID: Root=1-69e97239-3fb465b314555eae7fa9f9a3;65b140f4-ef92-41e8-9ee4-84cf683f28ef) + +Cannot access gated repo for url https://huggingface.co/google/gemma-3-12b-it/resolve/main/model.safetensors. +Access to model google/gemma-3-12b-it is restricted and you are not in the authorized list. Visit https://huggingface.co/google/gemma-3-12b-it to ask for access. + google/gemma-3-27b-it tp=1 pp=1 len=8192: 403 Client Error. (Request ID: Root=1-69e97239-248be84100e899b0767be1c5;07443cda-0363-431b-afad-f77e1b178691) + +Cannot access gated repo for url https://huggingface.co/google/gemma-3-27b-it/resolve/main/model.safetensors. 
+Access to model google/gemma-3-27b-it is restricted and you are not in the authorized list. Visit https://huggingface.co/google/gemma-3-27b-it to ask for access. + google/gemma-3-4b-it tp=1 pp=1 len=8192: 403 Client Error. (Request ID: Root=1-69e9723a-7fb5b9a73ea2a01d669da464;b311f6fd-0e19-49db-80ac-b7766c577bb2) + +Cannot access gated repo for url https://huggingface.co/google/gemma-3-4b-it/resolve/main/model.safetensors. +Access to model google/gemma-3-4b-it is restricted and you are not in the authorized list. Visit https://huggingface.co/google/gemma-3-4b-it to ask for access. + google/gemma-7b tp=1 pp=1 len=8192: 403 Client Error. (Request ID: Root=1-69e9723a-020f3f6365aa4bbc6ac51b1f;828a363e-b75f-490d-baca-d7c6927996c1) + +Cannot access gated repo for url https://huggingface.co/google/gemma-7b/resolve/main/model.safetensors. +Access to model google/gemma-7b is restricted and you are not in the authorized list. Visit https://huggingface.co/google/gemma-7b to ask for access. +2.1394 diff --git a/accuracy/results/v0.19.0/results_raw.csv b/accuracy/results/v0.19.0/results_raw.csv index 7c0b7d90..5bdf59d9 100644 --- a/accuracy/results/v0.19.0/results_raw.csv +++ b/accuracy/results/v0.19.0/results_raw.csv @@ -17,6 +17,7 @@ google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log,failed,,H100-80GB,, google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.log,ok,google/gemma-3-27b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,51.45,51.45,3.99,0.26,55.7,1.05,1.14,19.53,36560,15997,4.46,78.68,79.19,51,512,51 google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log,failed,,H100-80GB,,,,,,,,0.95,,,,,,,,,,,,,,,,,,,,, google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.log,ok,google/gemma-3-4b-it,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,8.58,8.58,3.89,0.25,12.72,0.7,0.67,62.51,468144,204817,57.05,78.68,79.19,51,512,51 
+google-gemma-4-e4b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log,failed,,H100-80GB,,,,,,,,0.95,,,,,,,,,,,,,,,,,,,,, google-gemma-7b--h100-80gb--tp1pp1dp1--8192.log,ok,google/gemma-7b,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,15.91,15.91,3.63,0.25,19.79,0.84,0.94,55.44,129760,8110,15.84,78.68,79.19,51,512,51 granite-3-1-2b--h100-80gb--tp1pp1dp1--8192.log,ok,ibm-granite/granite-3.1-2b-instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,4.74,4.74,0.75,0.46,5.95,1.6,0.84,69.28,908048,56753,110.85,78.68,79.19,51,512,51 granite-3-1-8b--h100-80gb--tp1pp1dp1--8192.log,ok,ibm-granite/granite-3.1-8b-instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,15.25,15.25,0.85,0.46,16.57,0.74,0.98,58.66,384432,24027,46.93,78.68,79.19,51,512,51 @@ -38,6 +39,7 @@ meta-llama-llama-3-1-8b---h100-80gb--tp4pp1dp1--8192.log,ok,meta-llama/Llama-3.1 meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--16384.log,ok,meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,16384,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,14.99,14.99,1.89,0.25,17.12,0.84,0.91,58.11,476016,29751,29.05,78.68,79.19,51,512,51 meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--32768.log,ok,meta-llama/Llama-3.1-8B-Instruct,H100-80GB,1,1,1,32768,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,14.99,14.99,1.89,0.25,17.12,0.84,0.91,58.11,476016,29751,14.53,78.68,79.19,51,512,51 meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.FAILED.log,failed,meta-llama/Llama-4-Scout-17B-16E-Instruct,H100-80GB,4,1,1,8192,torch.bfloat16,None,auto,0.95,77.12,79.19,2.07,2.05,75.23,,,,,,,,,,,,,,,, +meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.log,ok,meta-llama/Llama-4-Scout-17B-16E-Instruct,H100-80GB,4,1,1,8192,torch.bfloat16,None,auto,0.95,77.12,79.19,2.07,2.05,75.23,53.12,53.12,3.19,2.17,58.48,1.09,1.13,16.75,365968,91495,44.68,77.12,79.19,,, 
microsoft-phi-2--h100-80gb--tp1pp1dp1--2048.log,ok,microsoft/phi-2,H100-80GB,1,1,1,2048,torch.float16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,5.19,5.19,0.79,0.24,6.23,0.55,0.76,69.0,226112,14132,110.41,78.68,79.19,51,512,51 microsoft-phi-2--h100-80gb--tp1pp1dp1--8192.FAILED.log,failed,,H100-80GB,,,,,,,,0.95,,,,,,,,,,,,,,,,,,,,, microsoft-phi-4--h100-80gb--tp1pp1dp1--8192.log,ok,microsoft/phi-4,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,27.39,27.39,1.52,0.25,29.15,0.81,1.07,46.08,241568,15098,29.49,78.68,79.19,51,512,51 @@ -51,6 +53,7 @@ moonshotai-kimi-vl-a3b-i--h100-80gb--tp2pp1dp1--8192.log,ok,moonshotai/Kimi-VL-A openai-gpt-oss-20b--h100-80gb--tp1pp1dp1--8192.FAILED.log,failed,openai/gpt-oss-20b,H100-80GB,1,1,1,8192,torch.bfloat16,mxfp4,auto,0.95,78.68,79.19,0.51,0.51,75.23,13.64,13.64,2.87,0.25,16.76,1.3,1.84,58.47,1277184,,155.75,78.68,79.19,83,1024,83 openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.FAILED.log,failed,openai/gpt-oss-20b,H100-80GB,2,1,1,8192,torch.bfloat16,mxfp4,auto,0.95,77.18,79.19,2.01,1.99,75.23,7.01,7.01,2.87,2.07,11.95,1.09,1.55,63.28,2764768,,337.17,77.18,79.19,83,1024,83 openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.log,ok,openai/gpt-oss-20b,H100-80GB,2,1,1,8192,torch.bfloat16,mxfp4,auto,0.9,77.18,79.19,2.01,1.99,71.27,7.01,7.01,2.87,2.07,11.95,1.09,1.55,59.32,2591776,323972,316.07,77.18,79.19,83,1024,83 +qwen-qwen1-5-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.log,ok,Qwen/Qwen1.5-MoE-A2.7B,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,26.67,26.67,2.47,0.25,29.4,0.82,1.1,45.83,250304,15644,30.55,78.68,79.19,51,512,51 qwen-qwen2---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.log,ok,Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,fp8,0.95,78.68,79.19,0.51,0.51,75.23,14.25,14.25,2.21,0.24,16.7,0.61,0.86,58.53,2192000,137000,267.58,78.68,79.19,51,512,51 
qwen-qwen2-5-72b-instruc--h100-80gb--tp2pp1dp1--8192.log,ok,Qwen/Qwen2.5-72B-Instruct,H100-80GB,2,1,1,8192,torch.bfloat16,None,auto,0.95,77.66,79.19,1.53,1.51,75.23,67.8,67.8,2.29,2.09,72.19,1.63,1.61,3.04,19920,1245,2.43,77.66,79.19,51,512,51 qwen-qwen2-5-7b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.log,ok,Qwen/Qwen2.5-7B-Instruct,H100-80GB,1,1,1,8192,torch.bfloat16,None,auto,0.95,78.68,79.19,0.51,0.51,75.23,14.25,14.25,2.21,0.24,16.7,0.61,0.87,58.53,1096000,68500,133.79,78.68,79.19,51,512,51 diff --git a/accuracy/results/v0.19.0/run_matrix.md b/accuracy/results/v0.19.0/run_matrix.md deleted file mode 100644 index 21e96b2e..00000000 --- a/accuracy/results/v0.19.0/run_matrix.md +++ /dev/null @@ -1,82 +0,0 @@ -# Run Matrix — vLLM v0.19.0 / H100-80GB - -**55 successful runs, 8 failed runs.** - -Quantization abbreviations: `ct` = compressed-tensors, `gptq` = gptq_marlin, `fp8` = fp8 inline, `mxfp4` = mx-format fp4, `—` = none. - -## Successful Runs - -| Model | TP | PP | DP | max_len | dtype | quant | kv_dtype | Weight | Activation | Non-torch | KV cache | -|---|---|---|---|---|---|---|---|---|---|---|---| -| codellama/CodeLlama-7b-hf | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +523.4% | -40.0% | -5.1% | -| deepseek-ai/DeepSeek-V2-Lite-Chat | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.6% | +314.5% | -42.3% | -11.5% | -| google/gemma-2-27b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +50.3% | -42.3% | -4.6% | -| google/gemma-2-2b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.6% | +51.9% | -37.5% | -1.5% | -| google/gemma-2-9b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +50.7% | -40.0% | -1.8% | -| google/gemma-3-12b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -2.6% | +39.6% | -40.0% | -0.1% | -| google/gemma-3-27b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.7% | +37.8% | -42.3% | -1.4% | -| google/gemma-3-4b-it | 1 | 1 | 1 | 8192 | bf16 | — | auto | -6.6% | +41.4% | -40.0% | -0.3% | -| google/gemma-7b | 1 | 1 | 1 | 8192 | bf16 | — | auto | +0.0% | -34.0% | +66.7% | 
+1.8% | -| ibm-granite/granite-3.1-2b-instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.4% | +633.3% | -67.4% | -5.3% | -| ibm-granite/granite-3.1-8b-instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +547.1% | -67.4% | -6.0% | -| ibm-granite/granite-3.3-8b-instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +547.1% | -67.4% | -6.0% | -| ibm-granite/granite-vision-3.3-2b | 1 | 1 | 1 | 8192 | bf16 | — | auto | +0.0% | +216.5% | -40.0% | -1.2% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | fp8 | -0.2% | +154.0% | -40.0% | -3.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | bf16 | fp8 | auto | +76.2% | +154.0% | -40.0% | -13.2% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | f16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 8192 | f32 | — | auto | -50.1% | +117.2% | -40.0% | +31.1% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 2048 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 4096 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 2 | 1 | 8192 | bf16 | — | auto | -0.4% | +336.4% | +114.3% | -0.9% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 4 | 1 | 8192 | bf16 | — | auto | -12.2% | +357.1% | +114.3% | +1.6% | -| meta-llama/Llama-3.1-8B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.4% | +154.0% | -71.0% | +2.8% | -| meta-llama/Llama-3.1-8B-Instruct | 4 | 1 | 1 | 8192 | bf16 | — | auto | -0.8% | +154.0% | -71.8% | +4.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 16384 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 1 | 32768 | bf16 | — | auto | -0.2% | +154.0% | -40.0% | -3.5% | -| microsoft/phi-2 | 1 | 1 | 1 | 2048 | f16 | — | auto | +0.2% | -85.6% | +60.0% | +5.9% | -| 
microsoft/phi-4 | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.3% | +261.8% | -40.0% | -6.6% | -| mistralai/Mistral-Small-3.1-24B-Instruct-2503 | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +23.2% | -40.0% | +1.5% | -| mistralai/Mixtral-8x7B-Instruct-v0.1 | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +561.2% | -71.0% | -1.9% | -| moonshotai/Kimi-Dev-72B | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.2% | +144.5% | -71.3% | +61.8% | -| moonshotai/Kimi-Dev-72B | 4 | 1 | 1 | 8192 | bf16 | — | auto | -0.5% | +144.5% | -72.9% | +9.3% | -| moonshotai/Kimi-VL-A3B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.6% | +174.0% | -40.0% | -9.8% | -| moonshotai/Kimi-VL-A3B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -1.6% | +180.7% | -71.0% | +2.4% | -| openai/gpt-oss-20b | 2 | 1 | 1 | 8192 | bf16 | mxfp4 | auto | +9.4% | -64.1% | +245.0% | -2.7% | -| Qwen/Qwen2.5-72B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +144.5% | -71.3% | +60.8% | -| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | fp8 | -0.5% | +153.4% | -37.5% | -4.2% | -| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | -| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 16384 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | -| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 32768 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | -| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 2048 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | -| Qwen/Qwen2.5-7B-Instruct | 1 | 1 | 1 | 4096 | bf16 | — | auto | -0.5% | +153.4% | -37.5% | -4.2% | -| Qwen/Qwen2.5-7B-Instruct | 2 | 1 | 1 | 8192 | bf16 | — | auto | -0.4% | +153.4% | -70.9% | +2.6% | -| Qwen/Qwen2.5-7B-Instruct | 4 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +153.4% | -71.8% | +4.6% | -| Qwen/Qwen3-30B-A3B | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +198.5% | -44.4% | -28.8% | -| Qwen/Qwen3-8B | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +153.4% | -40.0% | -4.4% | -| 
RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic | 2 | 1 | 1 | 8192 | bf16 | ct | auto | -0.1% | +144.9% | -71.4% | +5.0% | -| RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic | 4 | 1 | 1 | 8192 | bf16 | ct | auto | -0.2% | +143.7% | -72.9% | +5.9% | -| redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 | 2 | 1 | 1 | 8192 | bf16 | ct | auto | -0.1% | +144.9% | -71.6% | +5.0% | -| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 | 1 | 1 | 1 | 8192 | f16 | gptq | auto | -0.7% | +154.0% | -40.0% | -3.0% | -| RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 | 1 | 1 | 1 | 8192 | f16 | ct | auto | -0.4% | +154.0% | -40.0% | -3.1% | -| RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 | 1 | 1 | 1 | 8192 | bf16 | ct | auto | -0.2% | +14.7% | -42.3% | +1.2% | -| RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 | 2 | 1 | 1 | 8192 | bf16 | ct | auto | -0.8% | +23.2% | -71.0% | +5.3% | -| RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic | 1 | 1 | 1 | 8192 | bf16 | ct | auto | -0.4% | +153.4% | -37.5% | -3.9% | -| RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 | 1 | 1 | 1 | 8192 | bf16 | ct | auto | -0.4% | +153.4% | -40.0% | -3.9% | - -## Failed Runs - -| Model | TP | PP | DP | max_len | Notes | -|---|---|---|---|---|---| -| codellama/CodeLlama-34b-hf | 2 | 1 | 1 | 8192 | GPU contention at runtime | -| meta-llama/Llama-3.1-8B-Instruct | 1 | 1 | 2 | 8192 | DP=2 | -| meta-llama/Llama-4-Scout-17B-16E-Instruct | 4 | 1 | 1 | 8192 | | -| microsoft/phi-2 | 1 | 1 | 1 | 8192 | max_model_len=8192 > max_position_embeddings=2048; fixed with max_model_len=2048 | -| moonshotai/Kimi-Dev-72B | 4 | 1 | 1 | 8192 | second attempt; tp=2 succeeded | -| openai/gpt-oss-20b | 1 | 1 | 1 | 8192 | sampler warmup OOM (~786 MiB needed, <552 MiB free) | -| openai/gpt-oss-20b | 2 | 1 | 1 | 8192 | sampler warmup OOM at gmu=0.95; succeeded at gmu=0.90 | -| Qwen/Qwen3-14B | 5 | 1 | 1 | 8192 | tp=5 invalid (vocab not divisible by 5) | - -## Calibration decisions - -_Document constant changes here: old value → 
new value, evidence._ diff --git a/accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.json new file mode 100644 index 00000000..659cb5cc --- /dev/null +++ b/accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.json @@ -0,0 +1,29 @@ +{ + "model": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 4, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": "2026-04-23T00:28:49.262359+00:00", + "log_path": "/data/results/v0.19.0/logs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.log", + "weight_memory_gib": 53.12, + "kv_cache_memory_gib": 16.75, + "cuda_graph_memory_gib": 1.09, + "max_concurrency": 44.68, + "kv_cache_tokens": 365968, + "vllm_version": null, + "vllm_commit": null, + "total_non_kv_memory_gib": 58.48, + "activation_memory_gib": 3.19, + "non_torch_forward_memory_gib": 2.17, + "profiling_weights_memory_gib": 53.12, + "kv_cache_blocks": 22873, + "kv_block_size_bytes": 786305 +} \ No newline at end of file diff --git a/accuracy/results/v0.19.0/runs/qwen-qwen1-5-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.json b/accuracy/results/v0.19.0/runs/qwen-qwen1-5-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.json new file mode 100644 index 00000000..2b5c6519 --- /dev/null +++ b/accuracy/results/v0.19.0/runs/qwen-qwen1-5-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.json @@ -0,0 +1,29 @@ +{ + "model": "Qwen/Qwen1.5-MoE-A2.7B", + "gpu": "H100-80GB", + "vllm_args": { + "tensor_parallel_size": 1, + "pipeline_parallel_size": 1, + "data_parallel_size": 1, + "max_model_len": 8192, + "gpu_memory_utilization": 0.95, + "dtype": "auto", + "quantization": null, + "kv_cache_dtype": "auto" + }, + "timestamp": 
"2026-04-23T01:02:23.363044+00:00", + "log_path": "/data/results/v0.19.0/logs/qwen-qwen1-5-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.log", + "weight_memory_gib": 26.67, + "kv_cache_memory_gib": 45.83, + "cuda_graph_memory_gib": 0.82, + "max_concurrency": 30.55, + "kv_cache_tokens": 250304, + "vllm_version": null, + "vllm_commit": null, + "total_non_kv_memory_gib": 29.4, + "activation_memory_gib": 2.47, + "non_torch_forward_memory_gib": 0.25, + "profiling_weights_memory_gib": 26.67, + "kv_cache_blocks": 15644, + "kv_block_size_bytes": 3145588 +} \ No newline at end of file diff --git a/accuracy/results/version-sweep/run_matrix.md b/accuracy/results/version-sweep/run_matrix.md deleted file mode 100644 index df539830..00000000 --- a/accuracy/results/version-sweep/run_matrix.md +++ /dev/null @@ -1,31 +0,0 @@ -# Run Matrix — vLLM Version Sensitivity / Qwen3-14B / H100-80GB - -**Goal**: Track how activation memory reported by vLLM changes across releases, to identify -when planner constants became stale. - -**4 successful runs, 1 failed run (first attempt only).** - -Model: `Qwen/Qwen3-14B` — tp=1, pp=1, dp=1, max_model_len=8192, dtype=auto, quant=none. -All runs on a single H100-80GB at `gpu_memory_utilization=0.95`. - -## Results - -| vLLM version | Weight (GiB) | Activation (GiB) | Non-torch (GiB) | KV cache (GiB) | Max concurrency | -|:---:|:---:|:---:|:---:|:---:|:---:| -| v0.15.0 | 27.52 | 5.64 | 0.13 | 41.94 | 33.55 | -| v0.16.0 | 27.52 | 5.64 | 0.13 | 41.94 | 33.55 | -| **v0.17.0** | 27.52 | **2.23** | 0.13 | 45.34 | 36.27 | -| v0.18.0 | 27.52 | 2.23 | 0.25 | 45.23 | 36.18 | - -**Key finding**: Activation memory dropped from 5.64 GiB to 2.23 GiB (−60%) between v0.16.0 and v0.17.0. -Weight memory is stable across all versions (as expected — model parameters don't change). -KV cache increased by ~3.4 GiB at v0.17.0+ because lower activation overhead leaves more headroom. 
- -The planner's Qwen3 activation constant (5.60 GiB) matches v0.16.0 exactly — the constants -were calibrated against v0.16.0 or earlier. - -## Failed Runs - -| vLLM version | Notes | -|:---:|---| -| v0.16.0 (attempt 1) | GPU contention at startup: only 61.8/79.19 GiB free (needed 75.23 GiB). Succeeded on retry. | diff --git a/accuracy/scripts/sweep.yaml b/accuracy/scripts/sweep.yaml index 24594b8e..3424bcfa 100644 --- a/accuracy/scripts/sweep.yaml +++ b/accuracy/scripts/sweep.yaml @@ -95,6 +95,15 @@ runs: - model: meta-llama/Llama-4-Scout-17B-16E-Instruct # 109B total, 17B active MoE (16 experts) tp: 4 # tp=1 OOM (~212 GiB total), tp=2 OOM (~106 GiB total) + # DONE: Qwen/Qwen1.5-MoE-A2.7B tp=1 + # - model: Qwen/Qwen1.5-MoE-A2.7B # 14.3B total, 2.7B active MoE; Qwen2Moe architecture + # tp: 1 + + # UNSUPPORTED: gemma4 arch not in transformers bundled with vLLM v0.19.0 + # - model: google/gemma-4-E4B-it # MoE; Gemma4 architecture (gated) + # tp: 1 + # hf_token_secret: hf-token-gemma + # ── Gemma models ────────────────────────────────────────────────────────── # Requires hf-token-gemma secret (separate from hf-token; Gemma repos are gated). 
- model: google/gemma-2-2b-it # 2.6B dense; Gemma2 architecture From d3121e059da88662db39a926a3ead964be97359b Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Wed, 22 Apr 2026 21:47:23 -0400 Subject: [PATCH 10/24] Add logs Signed-off-by: Jing Chen --- ...4b---h100-80gb--tp2pp1dp1--8192.FAILED.log | 308 ++ ...llama-7b-h--h100-80gb--tp1pp1dp1--8192.log | 764 +++ ...ma-3-3-70b--h100-80gb--tp2pp1dp1--8192.log | 2150 +++++++++ ...ma-3-3-70b--h100-80gb--tp4pp1dp1--8192.log | 4086 +++++++++++++++++ ...i-qwen2-5---h100-80gb--tp1pp1dp1--8192.log | 864 ++++ ...-it--h100-80gb--tp1pp1dp1--8192.FAILED.log | 133 + ...a-2-27b-it--h100-80gb--tp1pp1dp1--8192.log | 770 ++++ ...-it--h100-80gb--tp1pp1dp1--8192.FAILED.log | 133 + ...ma-2-2b-it--h100-80gb--tp1pp1dp1--8192.log | 745 +++ ...-it--h100-80gb--tp1pp1dp1--8192.FAILED.log | 133 + ...ma-2-9b-it--h100-80gb--tp1pp1dp1--8192.log | 749 +++ ...-it--h100-80gb--tp1pp1dp1--8192.FAILED.log | 133 + ...a-3-12b-it--h100-80gb--tp1pp1dp1--8192.log | 769 ++++ ...-it--h100-80gb--tp1pp1dp1--8192.FAILED.log | 133 + ...a-3-27b-it--h100-80gb--tp1pp1dp1--8192.log | 783 ++++ ...-it--h100-80gb--tp1pp1dp1--8192.FAILED.log | 133 + ...ma-3-4b-it--h100-80gb--tp1pp1dp1--8192.log | 766 +++ ...-it--h100-80gb--tp1pp1dp1--8192.FAILED.log | 77 + ...e-gemma-7b--h100-80gb--tp1pp1dp1--8192.log | 776 ++++ ...ite-3-1-2b--h100-80gb--tp1pp1dp1--8192.log | 743 +++ ...ite-3-1-8b--h100-80gb--tp1pp1dp1--8192.log | 746 +++ ...ma-3---h100-80gb--tp1pp1dp1--8192-qfp8.log | 1105 +++++ ...ma-3--h100-80gb--tp1pp1dp1--8192-dtf32.log | 750 +++ ...ma-3-1-8b--h100-80gb--tp1pp1dp1--16384.log | 746 +++ ...out--h100-80gb--tp4pp1dp1--8192.FAILED.log | 440 ++ ...ma-4-scout--h100-80gb--tp4pp1dp1--8192.log | 2342 ++++++++++ ...soft-phi-2--h100-80gb--tp1pp1dp1--2048.log | 769 ++++ ...i-2--h100-80gb--tp1pp1dp1--8192.FAILED.log | 80 + ...20b--h100-80gb--tp1pp1dp1--8192.FAILED.log | 1112 +++++ ...20b--h100-80gb--tp2pp1dp1--8192.FAILED.log | 2104 +++++++++ 
...pt-oss-20b--h100-80gb--tp2pp1dp1--8192.log | 1855 ++++++++ ...-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.log | 771 ++++ ...-small-24b--h100-80gb--tp1pp1dp1--8192.log | 928 ++++ ...-small-24b--h100-80gb--tp2pp1dp1--8192.log | 1847 ++++++++ ...qwen2-5-7b--h100-80gb--tp1pp1dp1--8192.log | 861 ++++ 35 files changed, 31604 insertions(+) create mode 100644 accuracy/results/v0.19.0/logs/codellama-codellama-34b---h100-80gb--tp2pp1dp1--8192.FAILED.log create mode 100644 accuracy/results/v0.19.0/logs/codellama-codellama-7b-h--h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/fp8dyn-llama-3-3-70b--h100-80gb--tp2pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/fp8dyn-llama-3-3-70b--h100-80gb--tp4pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/fp8dyn-redhatai-qwen2-5---h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log create mode 100644 accuracy/results/v0.19.0/logs/google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log create mode 100644 accuracy/results/v0.19.0/logs/google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log create mode 100644 accuracy/results/v0.19.0/logs/google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log create mode 100644 accuracy/results/v0.19.0/logs/google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log create mode 100644 accuracy/results/v0.19.0/logs/google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.log create mode 100644 
accuracy/results/v0.19.0/logs/google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log create mode 100644 accuracy/results/v0.19.0/logs/google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/google-gemma-4-e4b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log create mode 100644 accuracy/results/v0.19.0/logs/google-gemma-7b--h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/granite-3-1-2b--h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/granite-3-1-8b--h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3---h100-80gb--tp1pp1dp1--8192-qfp8.log create mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3--h100-80gb--tp1pp1dp1--8192-dtf32.log create mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--16384.log create mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.FAILED.log create mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/microsoft-phi-2--h100-80gb--tp1pp1dp1--2048.log create mode 100644 accuracy/results/v0.19.0/logs/microsoft-phi-2--h100-80gb--tp1pp1dp1--8192.FAILED.log create mode 100644 accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp1pp1dp1--8192.FAILED.log create mode 100644 accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.FAILED.log create mode 100644 accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen1-5-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/w8a8-mistral-small-24b--h100-80gb--tp1pp1dp1--8192.log create mode 100644 accuracy/results/v0.19.0/logs/w8a8-mistral-small-24b--h100-80gb--tp2pp1dp1--8192.log create mode 100644 
accuracy/results/v0.19.0/logs/w8a8-redhatai-qwen2-5-7b--h100-80gb--tp1pp1dp1--8192.log diff --git a/accuracy/results/v0.19.0/logs/codellama-codellama-34b---h100-80gb--tp2pp1dp1--8192.FAILED.log b/accuracy/results/v0.19.0/logs/codellama-codellama-34b---h100-80gb--tp2pp1dp1--8192.FAILED.log new file mode 100644 index 00000000..c54e2f3c --- /dev/null +++ b/accuracy/results/v0.19.0/logs/codellama-codellama-34b---h100-80gb--tp2pp1dp1--8192.FAILED.log @@ -0,0 +1,308 @@ +DEBUG 04-22 15:13:32 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 15:13:32 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 15:13:32 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 15:13:32 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:13:32 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:13:32 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 15:13:32 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 15:13:32 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 15:13:32 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 15:13:32 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:13:32 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:13:32 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 15:13:37 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 15:13:39 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 15:13:39 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 15:13:39 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 15:13:39 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 15:13:39 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 15:13:39 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:13:39 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 15:13:39 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 15:13:39 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model codellama/CodeLlama-34b-hf +(APIServer pid=1) INFO 04-22 15:13:39 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 15:13:39 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:13:39 [entrypoints/utils.py:233] non-default args: {'model_tag': 'codellama/CodeLlama-34b-hf', 'model': 'codellama/CodeLlama-34b-hf', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 15:13:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 15:13:39 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 15:13:39 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003513 secs +(APIServer pid=1) INFO 04-22 15:13:39 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 15:13:39 
[config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 15:13:39 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 15:13:39 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 15:13:39 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 15:13:39 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 15:13:39 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 15:13:39 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 15:13:39 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 15:13:39 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) INFO 04-22 15:13:40 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 15:13:40 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. 
+(APIServer pid=1) DEBUG 04-22 15:13:40 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 15:13:41 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 15:13:41 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 15:13:45 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 15:13:45 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 15:13:45 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 15:13:45 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:13:45 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:13:45 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 15:13:45 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 15:13:45 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 15:13:45 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 15:13:45 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:13:45 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:13:45 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 15:13:50 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=285) DEBUG 04-22 15:13:51 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 15:13:51 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=285) DEBUG 04-22 15:13:51 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/6031e569-78d3-4367-a319-d00f5784bdee'], outputs=['ipc:///tmp/8009dedd-8f40-47ff-b55d-2493b94b720f'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=285) DEBUG 04-22 15:13:51 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=285) DEBUG 04-22 15:13:51 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=285) DEBUG 04-22 15:13:51 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=285) DEBUG 04-22 15:13:51 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=285) DEBUG 04-22 15:13:51 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=285) INFO 04-22 15:13:51 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='codellama/CodeLlama-34b-hf', speculative_config=None, tokenizer='codellama/CodeLlama-34b-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=codellama/CodeLlama-34b-hf, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [4096, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=285) WARNING 04-22 15:13:51 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=285) INFO 04-22 15:13:51 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.128.10.77 (local), world_size=2, local_world_size=2 +(EngineCore pid=285) DEBUG 04-22 15:13:51 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/102b683d-6a30-49da-beae-627ed84291b1 +(EngineCore pid=285) DEBUG 04-22 15:13:51 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_18c33fdd'), local_subscribe_addr='ipc:///tmp/102b683d-6a30-49da-beae-627ed84291b1', local_notify_addr='ipc:///tmp/92e6e20f-9b40-4887-850b-cf71b0aa0473', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 15:13:55 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 15:13:55 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 15:13:55 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 15:13:55 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 15:13:55 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 15:13:55 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 15:13:55 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:13:55 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:13:55 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:13:55 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:13:55 [platforms/__init__.py:113] Checking if ROCm platform is available. 
+DEBUG 04-22 15:13:55 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 15:13:55 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 15:13:55 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 15:13:55 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:13:55 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 15:13:55 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 15:13:55 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 15:13:55 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 15:13:55 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:13:55 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:13:55 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 15:13:55 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:13:55 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 15:13:59 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 15:13:59 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 15:14:01 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 15:14:01 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 15:14:01 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 15:14:01 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 15:14:01 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 15:14:01 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 15:14:01 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 15:14:01 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(Worker pid=484) DEBUG 04-22 15:14:01 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:45259 backend=nccl +(Worker pid=484) INFO 04-22 15:14:01 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:45259 backend=nccl +(APIServer pid=1) DEBUG 04-22 15:14:01 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker pid=485) DEBUG 04-22 15:14:01 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:45259 backend=nccl +(Worker pid=485) INFO 04-22 15:14:01 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:45259 backend=nccl +[Gloo] Rank 0 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=485) DEBUG 04-22 15:14:01 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=484) DEBUG 04-22 15:14:01 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=485) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=485) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=484) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=484) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=484) DEBUG 04-22 15:14:02 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=484) INFO 04-22 15:14:02 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=485) DEBUG 04-22 15:14:02 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=484) DEBUG 04-22 15:14:02 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=484) DEBUG 04-22 15:14:02 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/6e43b6cb-dfa5-481f-9a41-9cfb8dbd05d1 +(Worker pid=484) DEBUG 04-22 15:14:02 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_832a28d6'), local_subscribe_addr='ipc:///tmp/6e43b6cb-dfa5-481f-9a41-9cfb8dbd05d1', local_notify_addr='ipc:///tmp/c880f7ba-c003-469c-b499-2c3408a7c327', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=485) DEBUG 04-22 15:14:02 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/6e43b6cb-dfa5-481f-9a41-9cfb8dbd05d1 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(Worker pid=484) INFO 04-22 15:14:02 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. 
+(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 605, in __init__ +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] self.worker.init_device() +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 312, in init_device +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] self.worker.init_device() # type: ignore +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ 
+(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 283, in init_device +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] self.requested_memory = request_memory(init_snapshot, self.cache_config) +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/utils.py", line 413, in request_memory +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] raise ValueError( +(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] ValueError: Free memory on device cuda:1 (60.87/79.19 GiB) on startup is less than desired GPU memory utilization (0.95, 75.23 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes. 
+(EngineCore pid=285) DEBUG 04-22 15:14:02 [v1/executor/multiproc_executor.py:419] Worker Termination: allow workers to gracefully shutdown +(Worker pid=484) DEBUG 04-22 15:14:03 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776870843.010323, auto_measure=True +(Worker pid=484) DEBUG 04-22 15:14:03 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=484) DEBUG 04-22 15:14:03 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=484) DEBUG 04-22 15:14:03 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=484) DEBUG 04-22 15:14:03 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=484) DEBUG 04-22 15:14:03 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=484) DEBUG 04-22 15:14:03 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=484) DEBUG 04-22 15:14:03 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=484) DEBUG 04-22 15:14:03 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_TP0 pid=484) INFO 04-22 15:14:03 [v1/worker/gpu_model_runner.py:4735] Starting to load model codellama/CodeLlama-34b-hf... 
+(Worker_TP0 pid=484) DEBUG 04-22 15:14:03 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_TP0 pid=484) INFO 04-22 15:14:03 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(Worker_TP0 pid=484) INFO 04-22 15:14:03 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP0 pid=484) DEBUG 04-22 15:14:03 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=484) DEBUG 04-22 15:14:03 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=484) DEBUG 04-22 15:14:03 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 97, 'silu_and_mul': 48, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=484) DEBUG 04-22 15:14:03 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP0 pid=484) DEBUG 04-22 15:14:04 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00007.safetensors', 'model-00007-of-00007.safetensors', 'model-00005-of-00007.safetensors', 'model-00004-of-00007.safetensors', 'model-00006-of-00007.safetensors', 'model-00003-of-00007.safetensors', 'model-00001-of-00007.safetensors']] +(EngineCore pid=285) DEBUG 04-22 15:14:06 [v1/executor/multiproc_executor.py:424] Worker Termination: workers still running sending SIGTERM +(EngineCore pid=285) DEBUG 04-22 15:14:10 [v1/executor/multiproc_executor.py:429] Worker Termination: resorting to SIGKILL to take down workers +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] EngineCore failed to start. +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] Traceback (most recent call last): +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] return func(*args, **kwargs) +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__ +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] super().__init__( +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File 
"/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 114, in __init__ +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] self.model_executor = executor_class(vllm_config) +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 101, in __init__ +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] super().__init__(vllm_config) +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] return func(*args, **kwargs) +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 103, in __init__ +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] self._init_executor() +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 190, in _init_executor +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] self.workers = WorkerProc.wait_for_ready(unready_workers) +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 736, in wait_for_ready +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] raise e from None +(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] Exception: WorkerProc initialization failed due to an 
exception in a background process. See stack trace for root cause. +(EngineCore pid=285) Process EngineCore: +(EngineCore pid=285) Traceback (most recent call last): +(EngineCore pid=285) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap +(EngineCore pid=285) self.run() +(EngineCore pid=285) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run +(EngineCore pid=285) self._target(*self._args, **self._kwargs) +(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1112, in run_engine_core +(EngineCore pid=285) raise e +(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core +(EngineCore pid=285) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) +(EngineCore pid=285) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(EngineCore pid=285) return func(*args, **kwargs) +(EngineCore pid=285) ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__ +(EngineCore pid=285) super().__init__( +(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 114, in __init__ +(EngineCore pid=285) self.model_executor = executor_class(vllm_config) +(EngineCore pid=285) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 101, in __init__ +(EngineCore pid=285) super().__init__(vllm_config) +(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(EngineCore pid=285) return func(*args, **kwargs) +(EngineCore pid=285) ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", 
line 103, in __init__ +(EngineCore pid=285) self._init_executor() +(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 190, in _init_executor +(EngineCore pid=285) self.workers = WorkerProc.wait_for_ready(unready_workers) +(EngineCore pid=285) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 736, in wait_for_ready +(EngineCore pid=285) raise e from None +(EngineCore pid=285) Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause. +(EngineCore pid=285) DEBUG 04-22 15:14:11 [v1/executor/multiproc_executor.py:438] Triggering shutdown of workers +(APIServer pid=1) DEBUG 04-22 15:14:11 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in +(APIServer pid=1) sys.exit(main()) +(APIServer pid=1) ^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main +(APIServer pid=1) args.dispatch_function(args) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd +(APIServer pid=1) uvloop.run(run_server(args)) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run +(APIServer pid=1) return __asyncio.run( +(APIServer pid=1) ^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run +(APIServer pid=1) return runner.run(main) +(APIServer pid=1) ^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run +(APIServer pid=1) return self._loop.run_until_complete(task) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "uvloop/loop.pyx", line 1518, 
in uvloop.loop.Loop.run_until_complete +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper +(APIServer pid=1) return await main +(APIServer pid=1) ^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server +(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker +(APIServer pid=1) async with build_async_engine_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client +(APIServer pid=1) async with build_async_engine_client_from_engine_args( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args +(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config +(APIServer pid=1) return cls( +(APIServer pid=1) ^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 154, in __init__ +(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(APIServer pid=1) return func(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client +(APIServer pid=1) return AsyncMPClient(*client_args) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(APIServer pid=1) return func(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 887, in __init__ +(APIServer pid=1) super().__init__( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 535, in __init__ +(APIServer pid=1) with launch_core_engines( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ +(APIServer pid=1) next(self.gen) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines +(APIServer pid=1) wait_for_engine_startup( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup +(APIServer pid=1) raise RuntimeError( +(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} +/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown + warnings.warn('resource_tracker: There appear to be %d ' +/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown + warnings.warn('resource_tracker: There appear to be %d ' diff --git a/accuracy/results/v0.19.0/logs/codellama-codellama-7b-h--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/codellama-codellama-7b-h--h100-80gb--tp1pp1dp1--8192.log new file mode 100644 index 00000000..c0dada63 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/codellama-codellama-7b-h--h100-80gb--tp1pp1dp1--8192.log @@ -0,0 +1,764 @@ +DEBUG 04-22 15:11:52 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 15:11:52 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 15:11:52 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 15:11:52 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:11:53 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:11:53 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 15:11:53 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 15:11:53 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 15:11:53 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 15:11:53 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:11:53 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:11:53 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-22 15:11:58 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 15:12:00 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 15:12:00 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 15:12:00 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 15:12:00 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 15:12:00 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 15:12:00 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:12:00 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 15:12:00 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 15:12:00 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model codellama/CodeLlama-7b-hf +(APIServer pid=1) INFO 04-22 15:12:00 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 15:12:00 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:12:00 [entrypoints/utils.py:233] non-default args: {'model_tag': 'codellama/CodeLlama-7b-hf', 'model': 'codellama/CodeLlama-7b-hf', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 15:12:00 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 15:12:11 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 15:12:11 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0034601 secs +(APIServer pid=1) 
INFO 04-22 15:12:11 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 15:12:11 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 15:12:11 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 15:12:11 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 15:12:11 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 15:12:11 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 15:12:11 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 15:12:11 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 15:12:11 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 15:12:12 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 15:12:12 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 15:12:13 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 15:12:13 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 15:12:17 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 15:12:17 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 15:12:17 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 15:12:17 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:12:17 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
+DEBUG 04-22 15:12:17 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 15:12:17 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 15:12:17 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 15:12:17 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 15:12:17 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:12:17 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:12:17 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 15:12:21 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=283) DEBUG 04-22 15:12:23 [v1/engine/core.py:1018] Waiting for init message from front-end. +(EngineCore pid=283) DEBUG 04-22 15:12:23 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/c0d6876b-7742-4367-b174-65491d8d5016'], outputs=['ipc:///tmp/1fb926e6-0134-40d0-9a6b-b4cc9cc044c7'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(APIServer pid=1) DEBUG 04-22 15:12:23 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=283) DEBUG 04-22 15:12:23 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=283) DEBUG 04-22 15:12:23 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=283) DEBUG 04-22 15:12:23 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=283) DEBUG 04-22 15:12:23 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=283) DEBUG 04-22 15:12:23 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(EngineCore pid=283) INFO 04-22 15:12:23 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='codellama/CodeLlama-7b-hf', speculative_config=None, tokenizer='codellama/CodeLlama-7b-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, 
served_model_name=codellama/CodeLlama-7b-hf, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 
'static_all_moe_layers': []} +(EngineCore pid=283) DEBUG 04-22 15:12:23 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.75:52901 backend=nccl +(EngineCore pid=283) INFO 04-22 15:12:23 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.75:52901 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=283) DEBUG 04-22 15:12:23 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(EngineCore pid=283) INFO 04-22 15:12:23 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=283) DEBUG 04-22 15:12:24 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776870744.2147915, auto_measure=True +(EngineCore pid=283) DEBUG 04-22 15:12:24 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=283) DEBUG 04-22 15:12:24 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=283) DEBUG 04-22 15:12:24 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=283) DEBUG 04-22 15:12:24 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=283) DEBUG 04-22 15:12:24 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=283) DEBUG 04-22 15:12:24 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(EngineCore pid=283) DEBUG 04-22 15:12:24 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=283) DEBUG 04-22 15:12:24 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). 
+(EngineCore pid=283) INFO 04-22 15:12:24 [v1/worker/gpu_model_runner.py:4735] Starting to load model codellama/CodeLlama-7b-hf... +(EngineCore pid=283) DEBUG 04-22 15:12:25 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=283) INFO 04-22 15:12:25 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=283) INFO 04-22 15:12:25 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=283) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=283) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=283) DEBUG 04-22 15:12:25 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=283) DEBUG 04-22 15:12:25 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=283) DEBUG 04-22 15:12:25 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=283) DEBUG 04-22 15:12:25 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(EngineCore pid=283) DEBUG 04-22 15:12:25 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00002.safetensors', 'model-00001-of-00002.safetensors']] +(APIServer pid=1) DEBUG 04-22 15:12:33 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 15:12:43 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=283) INFO 04-22 15:12:51 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for codellama/CodeLlama-7b-hf: 25.845622 seconds +(EngineCore pid=283) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=283) INFO 04-22 15:13:07 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/b9a1c77cfa/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=283) 
DEBUG 04-22 15:13:07 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=b999767673 comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/b9a1c77cfa/rank_0_0/backbone +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore 
pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=283) DEBUG 04-22 
15:13:07 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 
'VLLM_GC_DEBUG': '', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 
'VLLM_MLA_DISABLE': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 
'localhost', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 
'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=283) DEBUG 04-22 
15:13:07 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 
[compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore 
pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] Vllm config hash: b999767673 +(EngineCore pid=283) INFO 04-22 15:13:07 [compilation/backends.py:1111] Dynamo bytecode transform 
time: 4.37 s +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=283) DEBUG 04-22 15:13:08 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=283) DEBUG 04-22 15:13:08 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=283) DEBUG 04-22 15:13:08 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms +(EngineCore pid=283) DEBUG 04-22 15:13:08 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=283) DEBUG 04-22 15:13:08 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=283) INFO 04-22 15:13:09 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=283) DEBUG 04-22 15:13:09 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/b9a1c77cfa/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=283) DEBUG 04-22 15:13:09 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=283) DEBUG 04-22 15:13:09 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=283) DEBUG 04-22 15:13:09 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=283) DEBUG 04-22 15:13:09 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=283) DEBUG 04-22 15:13:09 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass 
completed in 0.2 ms +(EngineCore pid=283) DEBUG 04-22 15:13:10 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/b9a1c77cfa/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=283) DEBUG 04-22 15:13:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=283) DEBUG 04-22 15:13:11 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms +(EngineCore pid=283) DEBUG 04-22 15:13:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms +(EngineCore pid=283) DEBUG 04-22 15:13:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=283) DEBUG 04-22 15:13:11 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(EngineCore pid=283) DEBUG 04-22 15:13:12 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/b9a1c77cfa/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') +(EngineCore pid=283) INFO 04-22 15:13:12 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 4.21 s +(EngineCore pid=283) DEBUG 04-22 15:13:12 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/b9a1c77cfa/rank_0_0/backbone/computation_graph.py +(APIServer pid=1) DEBUG 04-22 15:13:13 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=283) INFO 04-22 15:13:13 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/07d551a6df0697f257924722aaa2124b2a7dd3a0ce7f90b96a420893bbcca842/rank_0_0/model +(EngineCore pid=283) INFO 04-22 15:13:13 [compilation/monitor.py:48] torch.compile took 10.16 s in total +(EngineCore pid=283) INFO 04-22 15:13:13 [compilation/monitor.py:76] Initial profiling/warmup run took 0.45 s +(EngineCore pid=283) INFO 04-22 15:13:19 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=283) INFO 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=283) DEBUG 04-22 15:13:19 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=283) DEBUG 04-22 15:13:19 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 122.00 MiB first-capture + (51-1) × 6.00 MiB per-graph 
+(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=283) DEBUG 04-22 15:13:19 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=283) DEBUG 04-22 15:13:19 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=283) DEBUG 04-22 15:13:20 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=283) INFO 04-22 15:13:20 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total +(EngineCore pid=283) DEBUG 04-22 15:13:20 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=283) DEBUG 04-22 15:13:20 [v1/worker/gpu_worker.py:430] Free memory after profiling: 65.49 GiB (total), 62.05 GiB (within requested) +(EngineCore pid=283) DEBUG 04-22 15:13:20 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.55 seconds. Total non KV cache memory: 13.57GiB; torch peak memory increase: 0.77GiB; non-torch forward increase memory: 0.25GiB; weights memory: 12.56GiB. 
+(EngineCore pid=283) INFO 04-22 15:13:20 [v1/worker/gpu_worker.py:436] Available KV cache memory: 61.66 GiB +(EngineCore pid=283) INFO 04-22 15:13:20 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. +(EngineCore pid=283) INFO 04-22 15:13:20 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 126,256 tokens +(EngineCore pid=283) INFO 04-22 15:13:20 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 15.41x +(EngineCore pid=283) 2026-04-22 15:13:20,851 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=283) DEBUG 04-22 15:13:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=283) 2026-04-22 15:13:20,863 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=283) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:16:12 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:16:12 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:16:12 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:16:12 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:16:12 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:16:12 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic +(APIServer pid=1) INFO 04-22 00:16:12 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:16:12 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:16:12 [entrypoints/utils.py:233] non-default args: {'model_tag': 'RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic', 'model': 'RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:16:12 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:16:12 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:16:12 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003880 secs +(APIServer pid=1) INFO 04-22 00:16:12 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 00:16:12 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:16:13 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:16:13 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:16:13 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:16:13 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 00:16:13 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:16:13 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 00:16:13 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:16:13 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) INFO 04-22 00:16:14 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 00:16:14 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:16:14 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:16:14 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:16:14 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:16:18 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:16:18 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:16:18 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:16:18 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:16:18 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:16:18 [platforms/__init__.py:113] Checking if ROCm platform is available. 
+DEBUG 04-22 00:16:18 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:16:18 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:16:18 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:16:18 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:16:18 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:16:18 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:16:23 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(APIServer pid=1) DEBUG 04-22 00:16:24 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. +(EngineCore pid=245) DEBUG 04-22 00:16:25 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:16:25 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=245) DEBUG 04-22 00:16:25 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/8fd259a3-1a8f-44e1-b97e-568878a2a7bb'], outputs=['ipc:///tmp/95ad2f05-64c8-4329-b07c-dda78cdfb113'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=245) DEBUG 04-22 00:16:25 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=245) DEBUG 04-22 00:16:25 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=245) DEBUG 04-22 00:16:25 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=245) DEBUG 04-22 00:16:25 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=245) DEBUG 04-22 00:16:25 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=245) INFO 04-22 00:16:25 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic', speculative_config=None, tokenizer='RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [4096, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=245) WARNING 04-22 00:16:25 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=245) INFO 04-22 00:16:25 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.128.4.190 (local), world_size=2, local_world_size=2 +(EngineCore pid=245) DEBUG 04-22 00:16:25 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/586b3818-4b38-4ebe-9921-e3c3607e8f9d +(EngineCore pid=245) DEBUG 04-22 00:16:25 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_63d3722a'), local_subscribe_addr='ipc:///tmp/586b3818-4b38-4ebe-9921-e3c3607e8f9d', local_notify_addr='ipc:///tmp/1c30918c-7bce-4425-b471-562d86f610dd', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 00:16:28 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:16:28 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:16:28 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:16:28 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:16:28 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:16:28 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:16:28 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:16:28 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:16:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:16:28 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:16:28 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:16:28 [platforms/__init__.py:134] Checking if XPU platform is available. 
+DEBUG 04-22 00:16:28 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:16:28 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:16:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:16:28 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:16:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:16:28 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:16:28 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:16:28 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:16:28 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:16:28 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:16:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:16:28 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:16:33 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:16:33 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:16:34 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:16:34 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:16:34 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:16:34 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+DEBUG 04-22 00:16:34 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:16:34 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:16:34 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:16:34 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) DEBUG 04-22 00:16:35 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker pid=444) DEBUG 04-22 00:16:35 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:36569 backend=nccl +(Worker pid=444) INFO 04-22 00:16:35 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:36569 backend=nccl +(Worker pid=445) DEBUG 04-22 00:16:35 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:36569 backend=nccl +(Worker pid=445) INFO 04-22 00:16:35 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:36569 backend=nccl +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=445) DEBUG 04-22 00:16:35 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=444) DEBUG 04-22 00:16:35 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 1 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 +(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=444) DEBUG 04-22 00:16:35 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=444) INFO 04-22 00:16:35 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=445) DEBUG 04-22 00:16:36 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=444) DEBUG 04-22 00:16:36 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=444) DEBUG 04-22 00:16:36 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/2abe8c31-ff70-4e4d-90a9-e0ab7800b406 +(Worker pid=444) DEBUG 04-22 00:16:36 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_ff754c24'), local_subscribe_addr='ipc:///tmp/2abe8c31-ff70-4e4d-90a9-e0ab7800b406', local_notify_addr='ipc:///tmp/efd5d6a7-a676-41f7-a221-0335ab138965', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=445) DEBUG 04-22 00:16:36 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2abe8c31-ff70-4e4d-90a9-e0ab7800b406 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(Worker pid=444) INFO 04-22 00:16:36 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(Worker pid=445) DEBUG 04-22 00:16:36 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776816996.768284, auto_measure=True +(Worker pid=445) DEBUG 04-22 00:16:36 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=444) DEBUG 04-22 00:16:36 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776816996.8347344, auto_measure=True +(Worker pid=444) DEBUG 04-22 00:16:36 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=445) DEBUG 04-22 00:16:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=445) DEBUG 04-22 00:16:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=445) DEBUG 04-22 00:16:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=445) DEBUG 04-22 00:16:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=445) DEBUG 04-22 00:16:36 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). 
+(Worker pid=444) DEBUG 04-22 00:16:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 00:16:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=444) DEBUG 04-22 00:16:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 00:16:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=444) DEBUG 04-22 00:16:36 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=444) DEBUG 04-22 00:16:36 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=444) DEBUG 04-22 00:16:37 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_TP0 pid=444) INFO 04-22 00:16:37 [v1/worker/gpu_model_runner.py:4735] Starting to load model RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic... 
+(Worker_TP0 pid=444) INFO 04-22 00:16:37 [model_executor/.../linear/__init__.py:261] Selected CutlassFP8ScaledMMLinearKernel for CompressedTensorsW8A8Fp8 +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.o_proj +(Worker_TP0 pid=444) INFO 04-22 00:16:37 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_TP0 pid=444) INFO 04-22 00:16:37 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(Worker_TP0 pid=444) INFO 04-22 00:16:37 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.gate_up_proj +(Worker_TP0 pid=444) 
DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.o_proj 
+(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.5.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.qkv_proj +(Worker_TP1 
pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.11.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.o_proj 
+(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.18.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.down_proj +(Worker_TP0 
pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.22.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.qkv_proj 
+(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.29.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.gate_up_proj 
+(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.32.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.down_proj +(Worker_TP1 
pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.40.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.o_proj 
+(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.43.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.gate_up_proj 
+(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.51.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.qkv_proj +(Worker_TP0 
pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.54.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.o_proj 
+(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.61.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.down_proj +(Worker_TP1 
pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.67.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.qkv_proj 
+(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.70.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.gate_up_proj 
+(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.77.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.qkv_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.o_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.gate_up_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.down_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.qkv_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.o_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.gate_up_proj +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.down_proj +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [config/compilation.py:1195] disabled custom ops: Counter({'quant_fp8': 320, 'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [config/compilation.py:1195] disabled custom ops: Counter({'quant_fp8': 320, 'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00013-of-00015.safetensors', 'model-00004-of-00015.safetensors', 'model-00006-of-00015.safetensors', 'model-00005-of-00015.safetensors', 'model-00009-of-00015.safetensors', 'model-00008-of-00015.safetensors', 'model-00014-of-00015.safetensors', 'model-00007-of-00015.safetensors', 'model-00012-of-00015.safetensors', 'model-00010-of-00015.safetensors', 'model-00011-of-00015.safetensors', 'model-00015-of-00015.safetensors', 'model-00002-of-00015.safetensors', 'model-00001-of-00015.safetensors', 'model-00003-of-00015.safetensors']] +(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00014-of-00015.safetensors', 'model-00011-of-00015.safetensors', 'model-00001-of-00015.safetensors', 'model-00012-of-00015.safetensors', 'model-00009-of-00015.safetensors', 'model-00004-of-00015.safetensors', 'model-00013-of-00015.safetensors', 'model-00003-of-00015.safetensors', 'model-00008-of-00015.safetensors', 'model-00010-of-00015.safetensors', 'model-00007-of-00015.safetensors', 'model-00015-of-00015.safetensors', 'model-00005-of-00015.safetensors', 'model-00006-of-00015.safetensors', 'model-00002-of-00015.safetensors']] +(Worker_TP0 pid=444) Loading safetensors checkpoint shards: 0% Completed | 0/15 [00:00 +(Worker_TP0 pid=444) DEBUG 04-22 00:17:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 00:17:32 [compilation/decorators.py:528] Start compiling function +(EngineCore pid=245) DEBUG 04-22 00:17:33 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) DEBUG 04-22 00:17:35 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(APIServer pid=1) DEBUG 04-22 00:17:45 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=53000d16bd comp=e546579c48 code=f156083917c384d07960be9da52b3831ca85a8c6cca0b42b018e2920d91ddd8b dir=/data/.cache/vllm/torch_compile_cache/815459ed8e/rank_1_0/backbone +(Worker_TP0 pid=444) INFO 04-22 00:17:49 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone for vLLM's torch.compile +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, 
+(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 
[compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 
[compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', 
+(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, 
+(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, 
+(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP1 pid=445) 
DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=53000d16bd comp=e546579c48 code=f156083917c384d07960be9da52b3831ca85a8c6cca0b42b018e2920d91ddd8b dir=/data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP1 pid=445) DEBUG 04-22 
00:17:49 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP1 
pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': 
'/data/.cache/vllm/xla_cache', +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] Vllm config hash: 53000d16bd +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, 
+(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, 
+(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 
'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 
'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 
'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 
'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP0 
pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, 
+(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 
'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 
[compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] Vllm config hash: 53000d16bd +(Worker_TP0 pid=444) INFO 04-22 00:17:49 [compilation/backends.py:1111] Dynamo bytecode transform time: 16.88 s +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 4096 +(Worker_TP0 pid=444) INFO 04-22 00:17:49 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm +(Worker_TP0 pid=444) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. +(Worker_TP0 pid=444) return func(*args, **kwargs) +(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=4096, hidden_dim=8192, dtype=torch.bfloat16 +(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=4096, hidden_dim=8192, dtype=torch.bfloat16 +(Worker_TP0 pid=444) INFO 04-22 00:17:49 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm +(Worker_TP0 pid=444) DEBUG 04-22 00:17:50 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 4096), (4097, 8192)] +(Worker_TP0 pid=444) DEBUG 04-22 00:17:50 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(Worker_TP0 pid=444) DEBUG 04-22 00:17:51 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 
04-22 00:17:51 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:17:51 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 00:17:51 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:17:51 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP0 pid=444) DEBUG 04-22 00:17:51 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 24.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:17:51 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:17:51 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP0 pid=444) DEBUG 04-22 00:17:51 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:17:51 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP1 pid=445) DEBUG 04-22 00:17:51 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 24.2 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:17:51 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:17:51 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP1 pid=445) DEBUG 04-22 00:17:51 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=444) INFO 04-22 00:17:54 [compilation/backends.py:372] Cache the graph of compile range (1, 4096) for later use +(Worker_TP0 pid=444) DEBUG 04-22 00:17:54 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_0', 
'/data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_0') +(Worker_TP0 pid=444) DEBUG 04-22 00:17:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 00:17:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:17:54 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP0 pid=444) DEBUG 04-22 00:17:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:17:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=444) DEBUG 04-22 00:17:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:17:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 00:17:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:17:54 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP1 pid=445) DEBUG 04-22 00:17:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:17:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=445) DEBUG 04-22 00:17:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(APIServer pid=1) DEBUG 04-22 00:17:55 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP0 pid=444) INFO 04-22 00:17:55 [compilation/backends.py:372] Cache the graph of compile range (4097, 8192) for later use +(Worker_TP0 pid=444) DEBUG 04-22 00:17:55 [compilation/backends.py:377] Store the 0-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_0') +(Worker_TP0 pid=444) DEBUG 04-22 00:17:56 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 00:17:56 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:17:56 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 00:17:56 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:17:56 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=444) DEBUG 04-22 00:17:56 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.1 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:17:56 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:17:56 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=444) DEBUG 04-22 00:17:56 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:17:56 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=445) DEBUG 04-22 00:17:56 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.5 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:17:56 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms +(Worker_TP1 
pid=445) DEBUG 04-22 00:17:56 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=445) DEBUG 04-22 00:17:56 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:17:58 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_1', '/data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_1') +(Worker_TP0 pid=444) DEBUG 04-22 00:17:59 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 00:17:59 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:17:59 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP0 pid=444) DEBUG 04-22 00:17:59 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:17:59 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=444) DEBUG 04-22 00:17:59 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.8 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:17:59 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 00:17:59 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:17:59 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP1 pid=445) DEBUG 04-22 00:17:59 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:17:59 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 
0 nodes +(Worker_TP1 pid=445) DEBUG 04-22 00:17:59 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:18:00 [compilation/backends.py:377] Store the 1-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_1') +(APIServer pid=1) DEBUG 04-22 00:18:05 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=444) DEBUG 04-22 00:18:06 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 00:18:06 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:18:06 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 00:18:06 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:18:06 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=444) DEBUG 04-22 00:18:06 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:18:06 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:18:06 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP0 pid=444) DEBUG 04-22 00:18:06 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:18:06 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=445) DEBUG 04-22 00:18:06 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.8 ms 
+(Worker_TP1 pid=445) DEBUG 04-22 00:18:06 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:18:06 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP1 pid=445) DEBUG 04-22 00:18:06 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:18:07 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=445) DEBUG 04-22 00:18:07 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:18:07 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) +(Worker_TP1 pid=445) DEBUG 04-22 00:18:07 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=445) DEBUG 04-22 00:18:07 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=445) DEBUG 04-22 00:18:07 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:18:07 [compilation/backends.py:377] Store the 80-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_80', '/data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_80') +(Worker_TP0 pid=444) INFO 04-22 00:18:07 [compilation/backends.py:390] Compiling a graph for compile range (1, 4096) takes 13.82 s +(Worker_TP0 pid=444) DEBUG 04-22 00:18:08 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=444) DEBUG 04-22 00:18:08 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:18:08 [compilation/passes/pass_manager.py:100] Skipping with compile 
range (4097, 8192) +(Worker_TP0 pid=444) DEBUG 04-22 00:18:08 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:18:08 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=444) DEBUG 04-22 00:18:08 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=444) DEBUG 04-22 00:18:09 [compilation/backends.py:377] Store the 80-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_80', '/data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_80') +(Worker_TP0 pid=444) INFO 04-22 00:18:09 [compilation/backends.py:390] Compiling a graph for compile range (4097, 8192) takes 15.00 s +(Worker_TP0 pid=444) DEBUG 04-22 00:18:09 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone/computation_graph.py +(Worker_TP0 pid=444) INFO 04-22 00:18:13 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/46f5ec07a78f772bd5aa151dedfba97679c60533c06813f17f54411b7bba24b7/rank_0_0/model +(Worker_TP0 pid=444) INFO 04-22 00:18:13 [compilation/monitor.py:48] torch.compile took 41.44 s in total +(Worker_TP0 pid=444) INFO 04-22 00:18:14 [compilation/monitor.py:76] Initial profiling/warmup run took 0.91 s +(APIServer pid=1) DEBUG 04-22 00:18:15 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP1 pid=445) INFO 04-22 00:18:20 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP1 pid=445) DEBUG 04-22 00:18:20 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP1 pid=445) INFO 04-22 00:18:20 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP0 pid=444) INFO 04-22 00:18:20 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP0 pid=444) DEBUG 04-22 00:18:20 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP0 pid=444) INFO 04-22 00:18:20 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP1 pid=445) DEBUG 04-22 00:18:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 108.00 MiB first-capture + (51-1) × 20.00 MiB per-graph +(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 108.00 MiB first-capture + (51-1) × 20.00 MiB per-graph +(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
+(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 268.00 MiB first-capture + (51-1) × 12.00 MiB per-graph +(Worker_TP0 pid=444) INFO 04-22 00:18:21 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 268.00 MiB first-capture + (51-1) × 12.00 MiB per-graph +(Worker_TP1 pid=445) INFO 04-22 00:18:21 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP1 pid=445) DEBUG 04-22 00:18:22 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP1 pid=445) INFO 04-22 00:18:22 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.82 GiB total +(Worker_TP0 pid=444) DEBUG 04-22 00:18:22 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP0 pid=444) INFO 04-22 00:18:22 [v1/worker/gpu_model_runner.py:5955] Estimated 
CUDA graph memory: 1.82 GiB total +(Worker_TP1 pid=445) DEBUG 04-22 00:18:23 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP1 pid=445) DEBUG 04-22 00:18:23 [v1/worker/gpu_worker.py:430] Free memory after profiling: 41.35 GiB (total), 38.92 GiB (within requested) +(Worker_TP1 pid=445) DEBUG 04-22 00:18:23 [v1/worker/gpu_worker.py:435] Memory profiling takes 50.75 seconds. Total non KV cache memory: 37.95GiB; torch peak memory increase: 1.96GiB; non-torch forward increase memory: 2.1GiB; weights memory: 33.88GiB. +(Worker_TP1 pid=445) INFO 04-22 00:18:23 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9730 to maintain the same effective KV cache size. +(Worker_TP1 pid=445) DEBUG 04-22 00:18:23 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=444) DEBUG 04-22 00:18:23 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP0 pid=444) DEBUG 04-22 00:18:23 [v1/worker/gpu_worker.py:430] Free memory after profiling: 41.35 GiB (total), 38.92 GiB (within requested) +(Worker_TP0 pid=444) DEBUG 04-22 00:18:23 [v1/worker/gpu_worker.py:435] Memory profiling takes 50.74 seconds. Total non KV cache memory: 37.95GiB; torch peak memory increase: 1.96GiB; non-torch forward increase memory: 2.1GiB; weights memory: 33.88GiB. 
+(Worker_TP0 pid=444) INFO 04-22 00:18:23 [v1/worker/gpu_worker.py:436] Available KV cache memory: 37.28 GiB +(Worker_TP0 pid=444) INFO 04-22 00:18:23 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9730 to maintain the same effective KV cache size. +(Worker_TP0 pid=444) DEBUG 04-22 00:18:23 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 00:18:23 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) INFO 04-22 00:18:23 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 244,304 tokens +(EngineCore pid=245) INFO 04-22 00:18:23 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 29.82x +(Worker_TP0 pid=444) DEBUG 04-22 00:18:23 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=445) DEBUG 04-22 00:18:23 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=445) INFO 04-22 00:18:23 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 +(Worker_TP0 pid=444) INFO 04-22 00:18:23 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 +(Worker_TP0 pid=444) DEBUG 04-22 00:18:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 00:18:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=444) 2026-04-22 00:18:23,183 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(Worker_TP1 pid=445) 2026-04-22 00:18:23,183 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP0 pid=444) DEBUG 04-22 00:18:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) DEBUG 04-22 00:18:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=445) 2026-04-22 00:18:23,737 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP0 pid=444) 2026-04-22 00:18:23,737 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=245) DEBUG 04-22 00:18:24 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=444) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=245) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=245) INFO 04-22 00:18:33 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP0 pid=444) DEBUG 04-22 00:18:33 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=445) DEBUG 04-22 00:18:33 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 00:18:33 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 00:18:33 [v1/engine/core.py:1158] EngineCore waiting for work. +(EngineCore pid=245) DEBUG 04-22 00:18:33 [v1/engine/core.py:1158] EngineCore waiting for work. 
+(APIServer pid=1) INFO 04-22 00:18:33 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] +(APIServer pid=1) WARNING 04-22 00:18:34 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 0.6, 'top_p': 0.9}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. +(APIServer pid=1) DEBUG 04-22 00:18:34 [renderers/base.py:197] Warming up chat template processing... +(APIServer pid=1) DEBUG 04-22 00:18:34 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e813da-230200740f79100a4c5b90d4;a4ef458b-27aa-4ba5-8bb8-4615bcbc9448) +(APIServer pid=1) DEBUG 04-22 00:18:34 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 00:18:34 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic/resolve/main/processor_config.json. +(APIServer pid=1) DEBUG 04-22 00:18:34 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e813da-115de46779174b0167c84f5d;86639a92-2492-4faf-9dd4-35b86bad219a) +(APIServer pid=1) DEBUG 04-22 00:18:34 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 00:18:34 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic/resolve/main/preprocessor_config.json. +(Worker_TP1 pid=445) DEBUG 04-22 00:18:34 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=444) DEBUG 04-22 00:18:34 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) INFO 04-22 00:18:35 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. 
+(APIServer pid=1) DEBUG 04-22 00:18:35 [renderers/base.py:203] Chat template warmup completed in 1.424s +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:37] Available routes are: +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /load, Methods: GET +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /version, Methods: GET +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /health, Methods: GET +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST +(APIServer pid=1) INFO 04-22 00:18:36 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST +(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST +(APIServer pid=1) INFO: Started server process [1] +(APIServer pid=1) INFO: Waiting for application startup. +(APIServer pid=1) INFO: Application startup complete. +(APIServer pid=1) DEBUG 04-22 00:18:41 [v1/engine/async_llm.py:875] Called check_health. +(APIServer pid=1) INFO: 10.128.4.2:46564 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/fp8dyn-llama-3-3-70b--h100-80gb--tp4pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/fp8dyn-llama-3-3-70b--h100-80gb--tp4pp1dp1--8192.log new file mode 100644 index 00000000..b4c1e019 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/fp8dyn-llama-3-3-70b--h100-80gb--tp4pp1dp1--8192.log @@ -0,0 +1,4086 @@ +DEBUG 04-22 00:18:46 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
+DEBUG 04-22 00:18:46 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:18:46 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:18:46 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:18:46 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:18:46 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:18:46 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:18:46 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:18:46 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:18:46 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:18:46 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:18:46 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:18:51 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:18:52 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 00:18:52 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:18:52 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:18:52 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:18:52 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:18:52 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:18:52 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:18:52 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:18:52 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic +(APIServer pid=1) INFO 04-22 00:18:52 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:18:52 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:18:52 [entrypoints/utils.py:233] non-default args: {'model_tag': 'RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic', 'model': 'RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic', 'max_model_len': 8192, 'tensor_parallel_size': 4, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:18:52 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:18:53 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:18:53 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003907 secs +(APIServer pid=1) INFO 04-22 00:18:53 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 00:18:53 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:18:53 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:18:53 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:18:53 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:18:53 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 00:18:53 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:18:53 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 00:18:53 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:18:53 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) INFO 04-22 00:18:55 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 00:18:55 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:18:55 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:18:56 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:18:56 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:19:00 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:19:00 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:19:00 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:19:00 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:19:00 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:19:00 [platforms/__init__.py:113] Checking if ROCm platform is available. 
+DEBUG 04-22 00:19:00 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:19:00 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:19:00 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:19:00 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:19:00 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:19:00 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:19:05 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(APIServer pid=1) DEBUG 04-22 00:19:06 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. +(EngineCore pid=243) DEBUG 04-22 00:19:06 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:19:06 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=243) DEBUG 04-22 00:19:06 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/759dcae2-a02d-4b3e-ae72-e93c4fadf53c'], outputs=['ipc:///tmp/9fb9ed15-dbde-4df5-92aa-a30bae7a9025'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 00:19:06 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 00:19:06 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 00:19:06 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 00:19:06 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 00:19:06 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=243) INFO 04-22 00:19:06 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic', speculative_config=None, tokenizer='RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [128, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) WARNING 04-22 00:19:06 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=243) INFO 04-22 00:19:06 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.128.4.191 (local), world_size=4, local_world_size=4 +(EngineCore pid=243) DEBUG 04-22 00:19:06 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/0924f52e-4b2c-4b6b-ad61-2dc2800e583c +(EngineCore pid=243) DEBUG 04-22 00:19:06 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_af2d89f2'), local_subscribe_addr='ipc:///tmp/0924f52e-4b2c-4b6b-ad61-2dc2800e583c', local_notify_addr='ipc:///tmp/81423ce9-3958-4734-a217-e6b87223c977', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 00:19:10 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:19:10 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:19:10 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:19:10 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:19:10 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:19:10 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:19:10 [platforms/__init__.py:134] Checking if XPU platform is available. 
+DEBUG 04-22 00:19:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:19:10 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:19:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:19:10 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:19:10 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:19:10 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:19:10 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:19:10 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:19:10 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
+DEBUG 04-22 00:19:10 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:19:10 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:19:10 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:19:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:19:10 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:19:15 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:19:15 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:19:15 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 00:19:15 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:19:16 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:19:16 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:19:16 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:19:16 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) DEBUG 04-22 00:19:16 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +DEBUG 04-22 00:19:16 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:19:16 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:19:16 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:19:16 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 00:19:16 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:19:16 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:19:16 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:19:16 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+DEBUG 04-22 00:19:17 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:19:17 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:19:17 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:19:17 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(Worker pid=444) DEBUG 04-22 00:19:17 [distributed/parallel_state.py:1356] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:39521 backend=nccl +(Worker pid=444) INFO 04-22 00:19:17 [distributed/parallel_state.py:1400] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:39521 backend=nccl +(Worker pid=442) DEBUG 04-22 00:19:17 [distributed/parallel_state.py:1356] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:39521 backend=nccl +(Worker pid=442) INFO 04-22 00:19:17 [distributed/parallel_state.py:1400] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:39521 backend=nccl +(Worker pid=443) DEBUG 04-22 00:19:17 [distributed/parallel_state.py:1356] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:39521 backend=nccl +(Worker pid=443) INFO 04-22 00:19:17 [distributed/parallel_state.py:1400] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:39521 backend=nccl +(Worker pid=445) DEBUG 04-22 00:19:17 [distributed/parallel_state.py:1356] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:39521 backend=nccl +(Worker pid=445) INFO 04-22 00:19:17 [distributed/parallel_state.py:1400] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:39521 backend=nccl +[Gloo] Rank 0 is connected to 3 peer ranks. 
Expected number of connected peer ranks is : 3 +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=443) DEBUG 04-22 00:19:17 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=442) DEBUG 04-22 00:19:17 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=444) DEBUG 04-22 00:19:17 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=445) DEBUG 04-22 00:19:17 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=442) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=442) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=442) DEBUG 04-22 00:19:18 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=442) INFO 04-22 00:19:18 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=444) DEBUG 04-22 00:19:20 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=443) DEBUG 04-22 00:19:20 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=445) DEBUG 04-22 00:19:20 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=442) DEBUG 04-22 00:19:20 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=442) DEBUG 04-22 00:19:20 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/4b28722e-c8bc-4a68-8db1-1fe76bb69325 +(Worker pid=442) DEBUG 04-22 00:19:20 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3], buffer_handle=(3, 4194304, 6, 'psm_ac4a1702'), local_subscribe_addr='ipc:///tmp/4b28722e-c8bc-4a68-8db1-1fe76bb69325', local_notify_addr='ipc:///tmp/260a4a4f-1f33-4aa9-bef9-48d4c19cb46c', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=444) DEBUG 04-22 00:19:20 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/4b28722e-c8bc-4a68-8db1-1fe76bb69325 +(Worker pid=443) DEBUG 04-22 00:19:20 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/4b28722e-c8bc-4a68-8db1-1fe76bb69325 +(Worker pid=445) DEBUG 04-22 00:19:20 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/4b28722e-c8bc-4a68-8db1-1fe76bb69325 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(Worker pid=442) INFO 04-22 00:19:20 [distributed/parallel_state.py:1716] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(Worker pid=445) DEBUG 04-22 00:19:20 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776817160.5383635, auto_measure=True +(Worker pid=445) DEBUG 04-22 00:19:20 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=443) DEBUG 04-22 00:19:20 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776817160.5518556, auto_measure=True +(Worker pid=443) DEBUG 04-22 00:19:20 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=442) DEBUG 04-22 00:19:20 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776817160.553573, auto_measure=True +(Worker pid=442) DEBUG 04-22 00:19:20 [v1/worker/gpu_worker.py:285] worker 
requested memory: 75.23GiB +(Worker pid=444) DEBUG 04-22 00:19:20 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776817160.5982003, auto_measure=True +(Worker pid=444) DEBUG 04-22 00:19:20 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=445) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=445) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=445) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=442) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=442) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=443) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=442) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred 
dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=445) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=442) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=445) DEBUG 04-22 00:19:20 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=443) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=444) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=442) DEBUG 04-22 00:19:20 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=444) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=442) DEBUG 04-22 00:19:20 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=444) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=443) DEBUG 04-22 00:19:20 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). 
+(Worker pid=444) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=444) DEBUG 04-22 00:19:20 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=442) DEBUG 04-22 00:19:20 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_TP0 pid=442) INFO 04-22 00:19:20 [v1/worker/gpu_model_runner.py:4735] Starting to load model RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic... +(Worker_TP3 pid=445) DEBUG 04-22 00:19:20 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:20 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.o_proj +(Worker_TP0 pid=442) INFO 04-22 00:19:21 [model_executor/.../linear/__init__.py:261] Selected CutlassFP8ScaledMMLinearKernel for CompressedTensorsW8A8Fp8 +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.o_proj +(Worker_TP0 pid=442) INFO 04-22 00:19:21 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. 
+(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.2.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_TP0 pid=442) INFO 04-22 00:19:21 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(Worker_TP0 pid=442) INFO 04-22 00:19:21 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.qkv_proj +(Worker_TP1 
pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.11.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.qkv_proj +(Worker_TP0 
pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.14.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.gate_up_proj +(Worker_TP2 
pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.17.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.down_proj +(Worker_TP0 
pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.19.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.down_proj +(Worker_TP0 
pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.8.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.gate_up_proj 
+(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.25.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.o_proj +(Worker_TP3 
pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.20.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.gate_up_proj 
+(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.30.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.down_proj +(Worker_TP3 
pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.25.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.gate_up_proj 
+(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 
for model.layers.35.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.qkv_proj 
+(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.24.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.down_proj +(Worker_TP3 
pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.34.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.gate_up_proj 
+(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 
for model.layers.36.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.o_proj 
+(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.46.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.o_proj +(Worker_TP0 
pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.41.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.down_proj +(Worker_TP2 
pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.51.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.gate_up_proj 
+(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.54.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.qkv_proj 
+(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.49.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.down_proj +(Worker_TP1 
pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.46.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.gate_up_proj 
+(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.62.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.qkv_proj +(Worker_TP0 
pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.64.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.down_proj 
+(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 
for model.layers.61.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.qkv_proj 
+(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.57.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.qkv_proj +(Worker_TP0 
pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.59.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.down_proj +(Worker_TP1 
pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.62.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.gate_up_proj 
+(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.71.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.72.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.qkv_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.gate_up_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.o_proj +(Worker_TP1 
pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.74.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.gate_up_proj +(Worker_TP0 
pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.o_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:22 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.gate_up_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.o_proj 
+(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.qkv_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.qkv_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.o_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.gate_up_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.79.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:22 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP3 pid=445) DEBUG 04-22 00:19:22 [config/compilation.py:1195] disabled custom ops: Counter({'quant_fp8': 320, 'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.down_proj +(Worker_TP3 pid=445) DEBUG 04-22 00:19:22 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.75.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.o_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.down_proj +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.down_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.qkv_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.o_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.gate_up_proj +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.down_proj +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [config/compilation.py:1195] disabled custom ops: Counter({'quant_fp8': 320, 'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [config/compilation.py:1195] disabled custom ops: Counter({'quant_fp8': 320, 'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [config/compilation.py:1195] disabled custom ops: Counter({'quant_fp8': 320, 'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP3 pid=445) DEBUG 04-22 00:19:22 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00010-of-00015.safetensors', 'model-00006-of-00015.safetensors', 'model-00007-of-00015.safetensors', 'model-00002-of-00015.safetensors', 'model-00011-of-00015.safetensors', 'model-00012-of-00015.safetensors', 'model-00003-of-00015.safetensors', 'model-00001-of-00015.safetensors', 'model-00009-of-00015.safetensors', 'model-00014-of-00015.safetensors', 'model-00004-of-00015.safetensors', 'model-00015-of-00015.safetensors', 'model-00013-of-00015.safetensors', 'model-00005-of-00015.safetensors', 'model-00008-of-00015.safetensors']] +(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00011-of-00015.safetensors', 'model-00014-of-00015.safetensors', 'model-00009-of-00015.safetensors', 'model-00002-of-00015.safetensors', 'model-00010-of-00015.safetensors', 'model-00001-of-00015.safetensors', 'model-00004-of-00015.safetensors', 'model-00015-of-00015.safetensors', 'model-00005-of-00015.safetensors', 'model-00006-of-00015.safetensors', 'model-00007-of-00015.safetensors', 'model-00013-of-00015.safetensors', 'model-00012-of-00015.safetensors', 'model-00003-of-00015.safetensors', 'model-00008-of-00015.safetensors']] +(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00006-of-00015.safetensors', 'model-00009-of-00015.safetensors', 'model-00002-of-00015.safetensors', 'model-00011-of-00015.safetensors', 'model-00013-of-00015.safetensors', 'model-00007-of-00015.safetensors', 'model-00008-of-00015.safetensors', 'model-00005-of-00015.safetensors', 'model-00012-of-00015.safetensors', 'model-00014-of-00015.safetensors', 'model-00010-of-00015.safetensors', 'model-00003-of-00015.safetensors', 'model-00015-of-00015.safetensors', 'model-00001-of-00015.safetensors', 'model-00004-of-00015.safetensors']] 
+(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00012-of-00015.safetensors', 'model-00004-of-00015.safetensors', 'model-00005-of-00015.safetensors', 'model-00015-of-00015.safetensors', 'model-00002-of-00015.safetensors', 'model-00013-of-00015.safetensors', 'model-00001-of-00015.safetensors', 'model-00010-of-00015.safetensors', 'model-00014-of-00015.safetensors', 'model-00003-of-00015.safetensors', 'model-00008-of-00015.safetensors', 'model-00009-of-00015.safetensors', 'model-00011-of-00015.safetensors', 'model-00006-of-00015.safetensors', 'model-00007-of-00015.safetensors']] +(Worker_TP0 pid=442) Loading safetensors checkpoint shards: 0% Completed | 0/15 [00:00 +(Worker_TP0 pid=442) DEBUG 04-22 00:19:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:19:37 [compilation/decorators.py:528] Start compiling function +(Worker_TP2 pid=444) DEBUG 04-22 00:19:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:19:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 00:19:37 [compilation/decorators.py:528] Start compiling function +(Worker_TP1 pid=443) DEBUG 04-22 00:19:37 [compilation/decorators.py:528] Start compiling function +(APIServer pid=1) DEBUG 04-22 00:19:46 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP3 pid=445) DEBUG 04-22 
00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP0 pid=442) DEBUG 04-22 
00:19:54 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 
[compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=cb10c79acc comp=e546579c48 code=f156083917c384d07960be9da52b3831ca85a8c6cca0b42b018e2920d91ddd8b dir=/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_3_0/backbone +(Worker_TP0 pid=442) INFO 04-22 00:19:54 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone for vLLM's torch.compile +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=cb10c79acc comp=e546579c48 code=f156083917c384d07960be9da52b3831ca85a8c6cca0b42b018e2920d91ddd8b dir=/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP3 pid=445) DEBUG 
04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, 
+(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, 
+(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP3 pid=445) 
DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP3 
pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] Vllm config hash: cb10c79acc +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 
'auto', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, 
+(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
+(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] Vllm config hash: cb10c79acc +(Worker_TP0 pid=442) INFO 04-22 00:19:54 [compilation/backends.py:1111] Dynamo bytecode transform time: 17.21 s +(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 2 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 128 +(Worker_TP0 pid=442) INFO 04-22 00:19:54 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP2 
pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=cb10c79acc comp=e546579c48 code=f156083917c384d07960be9da52b3831ca85a8c6cca0b42b018e2920d91ddd8b dir=/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_2_0/backbone +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP2 pid=444) DEBUG 04-22 
00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': 
'12.9', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP2 
pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': 
None, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] Vllm config hash: cb10c79acc +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP1 
pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=cb10c79acc comp=e546579c48 code=f156083917c384d07960be9da52b3831ca85a8c6cca0b42b018e2920d91ddd8b dir=/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_1_0/backbone +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP1 pid=443) DEBUG 04-22 
00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': 
'12.9', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP1 
pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': 
None, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] Vllm config hash: cb10c79acc +(Worker_TP0 pid=442) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. +(Worker_TP0 pid=442) return func(*args, **kwargs) +(Worker_TP1 pid=443) DEBUG 04-22 00:19:55 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=1, max_token_num=128, hidden_dim=8192, dtype=torch.bfloat16 +(Worker_TP2 pid=444) DEBUG 04-22 00:19:55 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=2, max_token_num=128, hidden_dim=8192, dtype=torch.bfloat16 +(Worker_TP0 pid=442) DEBUG 04-22 00:19:55 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=0, max_token_num=128, hidden_dim=8192, dtype=torch.bfloat16 +(Worker_TP3 pid=445) DEBUG 04-22 00:19:55 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=3, max_token_num=128, hidden_dim=8192, dtype=torch.bfloat16 +(Worker_TP0 pid=442) INFO 04-22 00:19:55 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm 
fusion workspace with backend=trtllm +(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 128), (129, 8192)] +(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(APIServer pid=1) DEBUG 04-22 00:19:56 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP1 pid=443) DEBUG 04-22 00:19:56 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:19:56 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP1 pid=443) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 24.7 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:19:56 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP1 pid=443) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:19:56 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=445) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:19:56 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP3 pid=445) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 25.7 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:19:56 
[compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP3 pid=445) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:19:56 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=444) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:19:56 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP2 pid=444) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 25.0 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:19:56 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP2 pid=444) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 25.1 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] 
FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=442) INFO 04-22 00:19:59 [compilation/backends.py:372] Cache the graph of compile range (1, 128) for later use +(Worker_TP0 pid=442) DEBUG 04-22 00:19:59 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 128) from inductor_standalone via handle ('artifact_compile_range_1_128_subgraph_0', '/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone/artifact_compile_range_1_128_subgraph_0') +(Worker_TP1 pid=443) DEBUG 04-22 00:20:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:20:00 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP1 pid=443) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:20:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=443) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=444) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:00 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP0 pid=442) DEBUG 04-22 00:20:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] 
PostCleanupPass completed in 0.4 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:00 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP2 pid=444) DEBUG 04-22 00:20:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=444) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=442) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:20:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=445) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:20:00 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP3 pid=445) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:20:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=445) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=442) INFO 04-22 00:20:01 [compilation/backends.py:372] Cache the graph of compile range (129, 8192) for later use +(Worker_TP0 pid=442) DEBUG 04-22 00:20:01 [compilation/backends.py:377] Store the 0-th graph for compile range(129, 8192) from inductor_standalone via handle ('artifact_compile_range_129_8192_subgraph_0', 
'/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone/artifact_compile_range_129_8192_subgraph_0') +(Worker_TP0 pid=442) DEBUG 04-22 00:20:02 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:02 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=444) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:02 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=442) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.6 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:02 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=442) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:02 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP2 pid=444) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 40.2 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:02 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP2 pid=444) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:20:02 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=445) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:20:02 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP3 pid=445) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.6 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:20:02 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP3 pid=445) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:20:03 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 00:20:03 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:20:03 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=443) DEBUG 04-22 00:20:03 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 40.3 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:20:03 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:20:03 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=443) DEBUG 04-22 00:20:03 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:04 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 128) from inductor_standalone via handle ('artifact_compile_range_1_128_subgraph_1', 
'/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone/artifact_compile_range_1_128_subgraph_1') +(Worker_TP0 pid=442) DEBUG 04-22 00:20:05 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:05 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP0 pid=442) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:20:05 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:05 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=443) DEBUG 04-22 00:20:05 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP0 pid=442) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:20:05 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=443) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:05 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=444) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:05 
[compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP2 pid=444) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:05 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=444) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:20:05 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=445) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:20:05 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP3 pid=445) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.6 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:20:05 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=445) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:06 [compilation/backends.py:377] Store the 1-th graph for compile range(129, 8192) from inductor_standalone via handle ('artifact_compile_range_129_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone/artifact_compile_range_129_8192_subgraph_1') +(APIServer pid=1) DEBUG 04-22 00:20:06 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP0 pid=442) DEBUG 04-22 00:20:12 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 00:20:12 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:12 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=442) DEBUG 04-22 00:20:12 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 40.6 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:12 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:12 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP0 pid=442) DEBUG 04-22 00:20:12 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:12 [compilation/backends.py:377] Store the 80-th graph for compile range(1, 128) from inductor_standalone via handle ('artifact_compile_range_1_128_subgraph_80', '/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone/artifact_compile_range_1_128_subgraph_80') +(Worker_TP0 pid=442) INFO 04-22 00:20:12 [compilation/backends.py:390] Compiling a graph for compile range (1, 128) takes 13.04 s +(Worker_TP0 pid=442) DEBUG 04-22 00:20:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=442) DEBUG 04-22 00:20:13 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms 
+(Worker_TP0 pid=442) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=442) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.8 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 40.6 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:20:13 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=445) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:20:13 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP3 pid=445) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.5 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:20:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP3 pid=445) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/passes/pass_manager.py:100] Skipping with 
compile range (129, 8192) +(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:13 [compilation/backends.py:377] Store the 80-th graph for compile range(129, 8192) from inductor_standalone via handle ('artifact_compile_range_129_8192_subgraph_80', '/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone/artifact_compile_range_129_8192_subgraph_80') +(Worker_TP0 pid=442) INFO 04-22 00:20:13 [compilation/backends.py:390] Compiling a graph for compile range (129, 8192) takes 14.17 s +(Worker_TP3 pid=445) DEBUG 04-22 00:20:14 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=445) DEBUG 04-22 00:20:14 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:20:14 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) +(Worker_TP3 pid=445) DEBUG 04-22 00:20:14 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP3 pid=445) DEBUG 04-22 00:20:14 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=445) DEBUG 04-22 00:20:14 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:14 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone/computation_graph.py +(APIServer pid=1) DEBUG 04-22 00:20:16 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP0 pid=442) INFO 04-22 00:20:19 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/484cf5817fb0bcdb65e1788392f05da3276f6ad8c6f5b65d8c5aef577cc4f5f9/rank_0_0/model +(Worker_TP0 pid=442) INFO 04-22 00:20:19 [compilation/monitor.py:48] torch.compile took 42.22 s in total +(Worker_TP0 pid=442) INFO 04-22 00:20:20 [compilation/monitor.py:76] Initial profiling/warmup run took 1.12 s +(Worker_TP1 pid=443) INFO 04-22 00:20:26 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP1 pid=443) DEBUG 04-22 00:20:26 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP1 pid=443) INFO 04-22 00:20:26 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP3 pid=445) INFO 04-22 00:20:26 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP3 pid=445) DEBUG 04-22 00:20:26 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP3 pid=445) INFO 04-22 00:20:26 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP0 pid=442) INFO 04-22 00:20:26 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP0 pid=442) DEBUG 04-22 00:20:26 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP0 pid=442) INFO 04-22 00:20:26 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(APIServer pid=1) DEBUG 04-22 00:20:26 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP2 pid=444) INFO 04-22 00:20:27 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP2 pid=444) INFO 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 
[compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 98.00 MiB first-capture + (51-1) × 18.00 MiB per-graph +(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 98.00 MiB first-capture + (51-1) × 18.00 MiB per-graph +(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 98.00 MiB first-capture + (51-1) × 18.00 MiB per-graph +(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 98.00 MiB first-capture + (51-1) × 18.00 MiB per-graph +(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 
00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: 
None +(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 142.00 MiB first-capture + (51-1) × 14.00 MiB per-graph +(Worker_TP2 pid=444) INFO 04-22 00:20:27 [distributed/device_communicators/custom_all_reduce.py:216] Registering 322 cuda graph addresses +(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 142.00 MiB first-capture + (51-1) × 14.00 MiB per-graph +(Worker_TP0 pid=442) INFO 04-22 00:20:27 [distributed/device_communicators/custom_all_reduce.py:216] Registering 322 cuda graph addresses +(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 142.00 MiB first-capture + (51-1) × 14.00 MiB per-graph +(Worker_TP1 pid=443) INFO 04-22 00:20:27 [distributed/device_communicators/custom_all_reduce.py:216] Registering 322 cuda graph addresses +(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 142.00 MiB first-capture + (51-1) × 14.00 MiB per-graph +(Worker_TP3 pid=445) INFO 04-22 00:20:27 [distributed/device_communicators/custom_all_reduce.py:216] Registering 322 cuda graph addresses +(Worker_TP3 pid=445) DEBUG 04-22 00:20:28 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP3 pid=445) INFO 04-22 00:20:28 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA 
graph memory: 1.70 GiB total +(Worker_TP2 pid=444) DEBUG 04-22 00:20:28 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP1 pid=443) DEBUG 04-22 00:20:28 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP2 pid=444) INFO 04-22 00:20:28 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.70 GiB total +(Worker_TP1 pid=443) INFO 04-22 00:20:28 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.70 GiB total +(Worker_TP0 pid=442) DEBUG 04-22 00:20:28 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP0 pid=442) INFO 04-22 00:20:28 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.70 GiB total +(Worker_TP3 pid=445) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP3 pid=445) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:430] Free memory after profiling: 58.07 GiB (total), 55.66 GiB (within requested) +(Worker_TP3 pid=445) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:435] Memory profiling takes 52.07 seconds. Total non KV cache memory: 21.14GiB; torch peak memory increase: 1.97GiB; non-torch forward increase memory: 2.21GiB; weights memory: 16.96GiB. +(Worker_TP3 pid=445) INFO 04-22 00:20:29 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9715 to maintain the same effective KV cache size. 
+(Worker_TP3 pid=445) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=444) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP2 pid=444) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:430] Free memory after profiling: 58.07 GiB (total), 55.66 GiB (within requested) +(Worker_TP2 pid=444) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:435] Memory profiling takes 52.09 seconds. Total non KV cache memory: 21.14GiB; torch peak memory increase: 1.97GiB; non-torch forward increase memory: 2.21GiB; weights memory: 16.96GiB. +(Worker_TP2 pid=444) INFO 04-22 00:20:29 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9715 to maintain the same effective KV cache size. +(Worker_TP2 pid=444) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=442) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP0 pid=442) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:430] Free memory after profiling: 58.07 GiB (total), 55.66 GiB (within requested) +(Worker_TP0 pid=442) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:435] Memory profiling takes 52.17 seconds. Total non KV cache memory: 21.14GiB; torch peak memory increase: 1.97GiB; non-torch forward increase memory: 2.21GiB; weights memory: 16.96GiB. 
+(Worker_TP0 pid=442) INFO 04-22 00:20:29 [v1/worker/gpu_worker.py:436] Available KV cache memory: 54.09 GiB +(Worker_TP0 pid=442) INFO 04-22 00:20:29 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9715 to maintain the same effective KV cache size. +(Worker_TP0 pid=442) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=443) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP1 pid=443) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:430] Free memory after profiling: 58.07 GiB (total), 55.66 GiB (within requested) +(Worker_TP1 pid=443) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:435] Memory profiling takes 52.12 seconds. Total non KV cache memory: 21.14GiB; torch peak memory increase: 1.97GiB; non-torch forward increase memory: 2.21GiB; weights memory: 16.96GiB. +(Worker_TP1 pid=443) INFO 04-22 00:20:29 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9715 to maintain the same effective KV cache size. 
+(EngineCore pid=243) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=443) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) INFO 04-22 00:20:29 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 708,992 tokens +(EngineCore pid=243) INFO 04-22 00:20:29 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 86.55x +(Worker_TP1 pid=443) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=442) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=444) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=445) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=442) 2026-04-22 00:20:29,522 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP3 pid=445) 2026-04-22 00:20:29,522 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP1 pid=443) 2026-04-22 00:20:29,522 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP2 pid=444) 2026-04-22 00:20:29,522 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(Worker_TP0 pid=442) DEBUG 04-22 00:20:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:20:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 00:20:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=445) DEBUG 04-22 00:20:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) 2026-04-22 00:20:29,542 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP1 pid=443) 2026-04-22 00:20:29,542 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP2 pid=444) 2026-04-22 00:20:29,543 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP3 pid=445) 2026-04-22 00:20:29,543 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP3 pid=445) DEBUG 04-22 00:20:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=444) DEBUG 04-22 00:20:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=443) DEBUG 04-22 00:20:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=442) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=243) INFO 04-22 00:20:38 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP0 pid=442) DEBUG 04-22 00:20:38 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=444) DEBUG 04-22 00:20:38 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=443) DEBUG 04-22 00:20:38 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=445) DEBUG 04-22 00:20:38 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 00:20:38 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=243) DEBUG 04-22 00:20:38 [v1/engine/core.py:1158] EngineCore waiting for work. +(EngineCore pid=243) DEBUG 04-22 00:20:38 [v1/engine/core.py:1158] EngineCore waiting for work. +(APIServer pid=1) INFO 04-22 00:20:38 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] +(APIServer pid=1) WARNING 04-22 00:20:39 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 0.6, 'top_p': 0.9}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. +(APIServer pid=1) DEBUG 04-22 00:20:39 [renderers/base.py:197] Warming up chat template processing... +(APIServer pid=1) DEBUG 04-22 00:20:39 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e81457-26a4c5e34aa79af7740f7281;feab3de1-de34-4d52-bd22-8ad50e42a3d4) +(APIServer pid=1) DEBUG 04-22 00:20:39 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 00:20:39 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic/resolve/main/processor_config.json. 
+(APIServer pid=1) DEBUG 04-22 00:20:39 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e81457-1ed4e7002e0a45454ba03fe8;1cdc5161-22f2-439d-9115-dff957aff247) +(APIServer pid=1) DEBUG 04-22 00:20:39 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 00:20:39 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic/resolve/main/preprocessor_config.json. +(Worker_TP2 pid=444) DEBUG 04-22 00:20:39 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=442) DEBUG 04-22 00:20:39 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=445) DEBUG 04-22 00:20:39 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=443) DEBUG 04-22 00:20:39 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) INFO 04-22 00:20:40 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. 
+(APIServer pid=1) DEBUG 04-22 00:20:40 [renderers/base.py:203] Chat template warmup completed in 1.352s +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:37] Available routes are: +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /load, Methods: GET +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /version, Methods: GET +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /health, Methods: GET +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST +(APIServer pid=1) INFO 04-22 00:20:41 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST +(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST +(APIServer pid=1) INFO: Started server process [1] +(APIServer pid=1) INFO: Waiting for application startup. +(APIServer pid=1) INFO: Application startup complete. +(APIServer pid=1) DEBUG 04-22 00:20:41 [v1/engine/async_llm.py:875] Called check_health. +(APIServer pid=1) INFO: 10.128.4.2:55156 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/fp8dyn-redhatai-qwen2-5---h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/fp8dyn-redhatai-qwen2-5---h100-80gb--tp1pp1dp1--8192.log new file mode 100644 index 00000000..d62dbe3d --- /dev/null +++ b/accuracy/results/v0.19.0/logs/fp8dyn-redhatai-qwen2-5---h100-80gb--tp1pp1dp1--8192.log @@ -0,0 +1,864 @@ +DEBUG 04-22 00:22:13 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
+DEBUG 04-22 00:22:13 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:22:13 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:22:13 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:22:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:22:13 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:22:13 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:22:13 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:22:13 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:22:13 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:22:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:22:13 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:22:18 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:22:20 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 00:22:20 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:22:20 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:22:20 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:22:20 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:22:20 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:22:20 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:22:20 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:22:20 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic +(APIServer pid=1) INFO 04-22 00:22:20 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:22:20 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:22:20 [entrypoints/utils.py:233] non-default args: {'model_tag': 'RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic', 'model': 'RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:22:20 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:22:20 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:22:20 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0004829 secs +(APIServer pid=1) INFO 04-22 00:22:20 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM +(APIServer pid=1) INFO 04-22 00:22:20 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:22:21 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:22:21 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:22:21 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:22:21 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 00:22:21 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 00:22:21 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:22:21 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:22:21 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:22:21 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:22:21 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:22:21 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:22:25 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:22:25 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:22:25 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:22:25 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:22:25 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:22:25 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:22:25 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:22:25 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:22:25 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:22:25 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:22:25 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:22:25 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-22 00:22:30 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(APIServer pid=1) DEBUG 04-22 00:22:32 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. +(EngineCore pid=242) DEBUG 04-22 00:22:32 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:22:32 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=242) DEBUG 04-22 00:22:32 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/a0babe96-5d72-4573-be1a-3d5bf9f4178d'], outputs=['ipc:///tmp/14c08b43-219a-4301-a691-68700f8de8bb'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=242) DEBUG 04-22 00:22:32 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=242) DEBUG 04-22 00:22:32 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=242) DEBUG 04-22 00:22:32 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=242) DEBUG 04-22 00:22:32 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=242) DEBUG 04-22 00:22:32 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=242) INFO 04-22 00:22:32 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic', speculative_config=None, tokenizer='RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=242) DEBUG 04-22 00:22:32 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.192:39497 backend=nccl +(EngineCore pid=242) INFO 04-22 00:22:32 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.192:39497 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=242) DEBUG 04-22 00:22:32 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=242) INFO 04-22 00:22:32 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=242) DEBUG 04-22 00:22:32 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776817352.8785338, auto_measure=True +(EngineCore pid=242) DEBUG 04-22 00:22:32 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=242) DEBUG 04-22 00:22:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=242) DEBUG 04-22 00:22:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=242) DEBUG 04-22 00:22:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=242) DEBUG 04-22 00:22:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=242) DEBUG 04-22 00:22:33 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=242) DEBUG 04-22 00:22:33 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=242) INFO 04-22 00:22:33 [v1/worker/gpu_model_runner.py:4735] Starting to load model RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic... +(EngineCore pid=242) INFO 04-22 00:22:33 [model_executor/.../linear/__init__.py:261] Selected CutlassFP8ScaledMMLinearKernel for CompressedTensorsW8A8Fp8 +(EngineCore pid=242) INFO 04-22 00:22:33 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=242) INFO 04-22 00:22:33 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(EngineCore pid=242) INFO 04-22 00:22:33 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.gate_up_proj +(EngineCore 
pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.8.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.o_proj 
+(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.19.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.qkv_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.down_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.qkv_proj 
+(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.o_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.gate_up_proj +(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.down_proj +(EngineCore pid=242) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=242) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=242) DEBUG 04-22 00:22:34 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=242) DEBUG 04-22 00:22:34 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=242) DEBUG 04-22 00:22:34 [config/compilation.py:1195] disabled custom ops: Counter({'quant_fp8': 112, 'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=242) DEBUG 04-22 00:22:34 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(EngineCore pid=242) DEBUG 04-22 00:22:34 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00002.safetensors', 'model-00001-of-00002.safetensors']] +(EngineCore pid=242) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore 
pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=242) INFO 04-22 00:22:49 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/71f6b61ae1/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=a0449d9b50 comp=e546579c48 code=850d7e0a2e6ea6d15823f3dba6f5b3cc98fdb412a0d3254bb67dce9fa1730fd3 dir=/data/.cache/vllm/torch_compile_cache/71f6b61ae1/rank_0_0/backbone +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 
'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 
'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=242) DEBUG 04-22 00:22:49 
[compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore 
pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore 
pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=242) DEBUG 04-22 
00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, 
+(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 
'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 
[compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, 
+(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] Vllm config hash: a0449d9b50 +(EngineCore pid=242) INFO 04-22 00:22:49 [compilation/backends.py:1111] Dynamo bytecode transform time: 6.75 s +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=242) DEBUG 04-22 00:22:50 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=242) DEBUG 04-22 00:22:50 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=242) DEBUG 04-22 00:22:50 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms +(EngineCore pid=242) DEBUG 04-22 00:22:50 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=242) DEBUG 04-22 00:22:50 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(APIServer pid=1) DEBUG 04-22 00:22:52 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=242) INFO 04-22 00:22:52 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=242) DEBUG 04-22 00:22:52 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/71f6b61ae1/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=242) DEBUG 04-22 00:22:53 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=242) DEBUG 04-22 00:22:53 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=242) DEBUG 04-22 00:22:53 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.8 ms +(EngineCore pid=242) DEBUG 04-22 00:22:53 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=242) DEBUG 04-22 00:22:53 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=242) DEBUG 04-22 00:22:55 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/71f6b61ae1/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=242) DEBUG 04-22 00:22:56 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=242) DEBUG 04-22 00:22:56 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.3 ms +(EngineCore pid=242) DEBUG 04-22 00:22:56 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.5 ms +(EngineCore pid=242) DEBUG 04-22 00:22:56 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=242) DEBUG 04-22 00:22:56 
[compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(EngineCore pid=242) DEBUG 04-22 00:22:56 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/71f6b61ae1/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') +(EngineCore pid=242) INFO 04-22 00:22:56 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 7.24 s +(EngineCore pid=242) DEBUG 04-22 00:22:57 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/71f6b61ae1/rank_0_0/backbone/computation_graph.py +(EngineCore pid=242) INFO 04-22 00:22:58 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/f4154a96ec3c2b706aaa7f98465c13eb99158e9860b8e6dad729e3b5d6402e64/rank_0_0/model +(EngineCore pid=242) INFO 04-22 00:22:58 [compilation/monitor.py:48] torch.compile took 16.09 s in total +(EngineCore pid=242) INFO 04-22 00:22:59 [compilation/monitor.py:76] Initial profiling/warmup run took 0.33 s +(APIServer pid=1) DEBUG 04-22 00:23:02 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=242) INFO 04-22 00:23:04 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=242) DEBUG 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=242) INFO 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=242) DEBUG 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 00:23:04 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=242) DEBUG 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 00:23:04 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=242) DEBUG 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 108.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=242) DEBUG 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 00:23:04 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
+(EngineCore pid=242) DEBUG 04-22 00:23:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 00:23:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) DEBUG 04-22 00:23:05 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=242) DEBUG 04-22 00:23:05 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 228.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=242) DEBUG 04-22 00:23:05 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=242) INFO 04-22 00:23:05 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.71 GiB total +(EngineCore pid=242) DEBUG 04-22 00:23:05 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=242) DEBUG 04-22 00:23:05 [v1/worker/gpu_worker.py:430] Free memory after profiling: 69.06 GiB (total), 65.62 GiB (within requested) +(EngineCore pid=242) DEBUG 04-22 00:23:05 [v1/worker/gpu_worker.py:435] Memory profiling takes 23.23 seconds. Total non KV cache memory: 10.59GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.24GiB; weights memory: 8.14GiB. +(EngineCore pid=242) INFO 04-22 00:23:05 [v1/worker/gpu_worker.py:436] Available KV cache memory: 64.64 GiB +(EngineCore pid=242) INFO 04-22 00:23:05 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9590 to maintain the same effective KV cache size. 
+(EngineCore pid=242) INFO 04-22 00:23:05 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,210,304 tokens +(EngineCore pid=242) INFO 04-22 00:23:05 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 147.74x +(EngineCore pid=242) 2026-04-22 00:23:05,885 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=242) DEBUG 04-22 00:23:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=242) 2026-04-22 00:23:05,894 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=242) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 15:50:06 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 15:50:06 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 15:50:06 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:50:06 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 15:50:06 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 15:50:06 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-2-27b-it +(APIServer pid=1) INFO 04-22 15:50:06 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 15:50:06 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:50:06 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-2-27b-it', 'model': 'google/gemma-2-27b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 15:50:06 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 403, in hf_raise_for_status +(APIServer pid=1) response.raise_for_status() +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1028, in raise_for_status +(APIServer pid=1) raise HTTPError(http_error_msg, response=self) +(APIServer pid=1) requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/google/gemma-2-27b-it/resolve/main/config.json +(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1550, in _get_metadata_or_catch_error +(APIServer pid=1) metadata = get_hf_file_metadata( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn +(APIServer 
pid=1) return fn(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1467, in get_hf_file_metadata +(APIServer pid=1) r = _request_wrapper( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 283, in _request_wrapper +(APIServer pid=1) response = _request_wrapper( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 307, in _request_wrapper +(APIServer pid=1) hf_raise_for_status(response) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 467, in hf_raise_for_status +(APIServer pid=1) raise _format(HfHubHTTPError, message, response) from e +(APIServer pid=1) huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-69e8ee2e-545ab1ad15853b3475ad62dd;fd90b524-9997-45ee-982d-17bb6e249315) +(APIServer pid=1) +(APIServer pid=1) 403 Forbidden: Please enable access to public gated repositories in your fine-grained token settings to view this repository.. +(APIServer pid=1) Cannot access content at: https://huggingface.co/google/gemma-2-27b-it/resolve/main/config.json. +(APIServer pid=1) Make sure your token has the correct permissions. 
+(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 479, in cached_files +(APIServer pid=1) hf_hub_download( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn +(APIServer pid=1) return fn(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1014, in hf_hub_download +(APIServer pid=1) return _hf_hub_download_to_cache_dir( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1121, in _hf_hub_download_to_cache_dir +(APIServer pid=1) _raise_on_head_call_error(head_call_error, force_download, local_files_only) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1665, in _raise_on_head_call_error +(APIServer pid=1) raise LocalEntryNotFoundError( +(APIServer pid=1) huggingface_hub.errors.LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on. 
+(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in +(APIServer pid=1) sys.exit(main()) +(APIServer pid=1) ^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main +(APIServer pid=1) args.dispatch_function(args) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd +(APIServer pid=1) uvloop.run(run_server(args)) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run +(APIServer pid=1) return __asyncio.run( +(APIServer pid=1) ^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run +(APIServer pid=1) return runner.run(main) +(APIServer pid=1) ^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run +(APIServer pid=1) return self._loop.run_until_complete(task) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper +(APIServer pid=1) return await main +(APIServer pid=1) ^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server +(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker +(APIServer pid=1) async with build_async_engine_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return 
await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client +(APIServer pid=1) async with build_async_engine_client_from_engine_args( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 124, in build_async_engine_client_from_engine_args +(APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1539, in create_engine_config +(APIServer pid=1) maybe_override_with_speculators( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/config.py", line 583, in maybe_override_with_speculators +(APIServer pid=1) config_dict, _ = PretrainedConfig.get_config_dict( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 662, in get_config_dict +(APIServer pid=1) config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 721, in _get_config_dict +(APIServer pid=1) resolved_config_file = cached_file( +(APIServer pid=1) ^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 322, in cached_file +(APIServer pid=1) file = 
cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 553, in cached_files +(APIServer pid=1) raise OSError( +(APIServer pid=1) OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files. +(APIServer pid=1) Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'. diff --git a/accuracy/results/v0.19.0/logs/google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.log new file mode 100644 index 00000000..c955da1e --- /dev/null +++ b/accuracy/results/v0.19.0/logs/google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.log @@ -0,0 +1,770 @@ +DEBUG 04-22 15:58:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 15:58:01 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 15:58:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 15:58:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:58:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:58:01 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 15:58:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 15:58:01 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 15:58:01 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 15:58:01 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-22 15:58:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:58:01 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 15:58:05 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 15:58:07 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 15:58:07 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 15:58:07 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 15:58:07 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 15:58:07 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 15:58:07 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:58:07 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 15:58:07 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 15:58:07 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-2-27b-it +(APIServer pid=1) INFO 04-22 15:58:07 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 15:58:07 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:58:07 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-2-27b-it', 'model': 'google/gemma-2-27b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 15:58:07 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 15:58:08 [model_executor/models/registry.py:827] Loaded model info for class 
vllm.model_executor.models.gemma2.Gemma2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 15:58:08 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003551 secs +(APIServer pid=1) INFO 04-22 15:58:08 [config/model.py:549] Resolved architecture: Gemma2ForCausalLM +(APIServer pid=1) INFO 04-22 15:58:08 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 15:58:08 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 15:58:08 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 15:58:08 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 15:58:08 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 15:58:08 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 15:58:08 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 15:58:08 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 15:58:08 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 15:58:08 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 15:58:10 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 15:58:10 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 15:58:14 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 15:58:14 [platforms/__init__.py:37] Checking if TPU platform is available. 
+DEBUG 04-22 15:58:14 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 15:58:14 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:58:14 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:58:14 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 15:58:14 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 15:58:14 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 15:58:14 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 15:58:14 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:58:14 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:58:14 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 15:58:19 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=286) DEBUG 04-22 15:58:20 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 15:58:20 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=286) DEBUG 04-22 15:58:20 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/e4f62627-5c43-4e86-9af9-f2733a970dd3'], outputs=['ipc:///tmp/5010f7f7-7c1c-4ad2-ae08-9526b73a064c'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=286) DEBUG 04-22 15:58:20 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=286) DEBUG 04-22 15:58:20 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=286) DEBUG 04-22 15:58:20 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=286) DEBUG 04-22 15:58:20 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=286) DEBUG 04-22 15:58:20 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=286) INFO 04-22 15:58:20 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google/gemma-2-27b-it', speculative_config=None, tokenizer='google/gemma-2-27b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google/gemma-2-27b-it, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=286) DEBUG 04-22 15:58:21 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.216:57701 backend=nccl +(EngineCore pid=286) INFO 04-22 15:58:21 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.216:57701 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=286) DEBUG 04-22 15:58:21 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=286) INFO 04-22 15:58:21 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=286) DEBUG 04-22 15:58:21 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776873501.6531677, auto_measure=True +(EngineCore pid=286) DEBUG 04-22 15:58:21 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=286) DEBUG 04-22 15:58:21 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=286) DEBUG 04-22 15:58:21 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=286) DEBUG 04-22 15:58:21 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=286) DEBUG 04-22 15:58:21 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=286) DEBUG 04-22 15:58:21 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=286) DEBUG 04-22 15:58:21 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=286) DEBUG 04-22 15:58:21 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=286) DEBUG 04-22 15:58:21 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=286) INFO 04-22 15:58:21 [v1/worker/gpu_model_runner.py:4735] Starting to load model google/gemma-2-27b-it... +(EngineCore pid=286) DEBUG 04-22 15:58:22 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=286) INFO 04-22 15:58:22 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=286) INFO 04-22 15:58:22 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=286) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=286) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=286) DEBUG 04-22 15:58:22 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=286) DEBUG 04-22 15:58:22 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=286) DEBUG 04-22 15:58:22 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 185, 'gelu_and_mul': 46, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'logits_processor': 1}) +(EngineCore pid=286) DEBUG 04-22 15:58:22 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=286) DEBUG 04-22 15:58:22 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00011-of-00012.safetensors', 'model-00003-of-00012.safetensors', 'model-00006-of-00012.safetensors', 'model-00001-of-00012.safetensors', 'model-00009-of-00012.safetensors', 'model-00002-of-00012.safetensors', 'model-00010-of-00012.safetensors', 'model-00012-of-00012.safetensors', 'model-00008-of-00012.safetensors', 'model-00005-of-00012.safetensors', 'model-00004-of-00012.safetensors', 'model-00007-of-00012.safetensors']] +(APIServer pid=1) DEBUG 04-22 15:58:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 15:58:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 15:58:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 15:59:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 15:59:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 15:59:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(APIServer pid=1) DEBUG 04-22 15:59:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 15:59:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 15:59:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 16:00:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 16:00:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 16:00:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=286) INFO 04-22 16:00:22 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for google/gemma-2-27b-it: 120.123986 seconds +(EngineCore pid=286) Loading safetensors checkpoint shards: 0% Completed | 0/12 [00:00 +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gemma2.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=286) INFO 04-22 16:01:17 [compilation/backends.py:1051] 
Using cache directory: /data/.cache/vllm/torch_compile_cache/8070fc1ef5/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=a866ad9f99 comp=e546579c48 code=1effc1a117077c34d7781a48a74d1e9a6c34076dbd4b7937695d68b6a3d23a2c dir=/data/.cache/vllm/torch_compile_cache/8070fc1ef5/rank_0_0/backbone +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 
'auto', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, 
+(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=286) DEBUG 04-22 16:01:17 
[compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=286) DEBUG 04-22 16:01:17 
[compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=286) DEBUG 04-22 16:01:17 
[compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
+(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] Vllm config hash: a866ad9f99 +(EngineCore pid=286) INFO 04-22 16:01:17 [compilation/backends.py:1111] Dynamo bytecode transform time: 6.19 s +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=286) DEBUG 04-22 16:01:18 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=286) DEBUG 04-22 16:01:18 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=286) DEBUG 04-22 16:01:18 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=286) DEBUG 04-22 16:01:18 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=286) DEBUG 04-22 16:01:18 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=286) INFO 04-22 16:01:20 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=286) DEBUG 04-22 16:01:20 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/8070fc1ef5/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(APIServer pid=1) DEBUG 04-22 16:01:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=286) DEBUG 04-22 16:01:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=286) DEBUG 04-22 16:01:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=286) DEBUG 04-22 16:01:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.7 ms +(EngineCore pid=286) DEBUG 04-22 16:01:21 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=286) DEBUG 04-22 16:01:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=286) DEBUG 04-22 16:01:22 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/8070fc1ef5/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=286) DEBUG 04-22 16:01:23 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=286) DEBUG 04-22 16:01:23 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.4 ms +(EngineCore pid=286) DEBUG 04-22 16:01:23 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.5 ms +(EngineCore pid=286) DEBUG 04-22 16:01:23 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=286) DEBUG 04-22 16:01:23 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(EngineCore pid=286) DEBUG 04-22 16:01:24 [compilation/backends.py:377] Store the 46-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_46', '/data/.cache/vllm/torch_compile_cache/8070fc1ef5/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_46') +(EngineCore pid=286) INFO 04-22 16:01:24 
[compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.31 s +(EngineCore pid=286) DEBUG 04-22 16:01:24 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/8070fc1ef5/rank_0_0/backbone/computation_graph.py +(EngineCore pid=286) INFO 04-22 16:01:26 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/eb9d3a1e33941e4c476db9ae83727bbefc20c4ff28627d8fa1c23be91a2ee24c/rank_0_0/model +(EngineCore pid=286) INFO 04-22 16:01:26 [compilation/monitor.py:48] torch.compile took 14.67 s in total +(EngineCore pid=286) INFO 04-22 16:01:26 [compilation/monitor.py:76] Initial profiling/warmup run took 0.28 s +(APIServer pid=1) DEBUG 04-22 16:01:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=286) INFO 04-22 16:01:32 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=286) INFO 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=286) DEBUG 04-22 16:01:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, 
ubatch_slices_padded: None +(EngineCore pid=286) DEBUG 04-22 16:01:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 212.00 MiB first-capture + (51-1) × 10.00 MiB per-graph +(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=286) DEBUG 04-22 16:01:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=286) DEBUG 04-22 16:01:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 8.00 MiB per-graph +(EngineCore pid=286) DEBUG 04-22 16:01:33 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=286) INFO 04-22 16:01:33 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.13 GiB total +(EngineCore pid=286) DEBUG 04-22 16:01:33 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=286) DEBUG 04-22 16:01:33 [v1/worker/gpu_worker.py:430] Free memory after profiling: 
25.35 GiB (total), 21.91 GiB (within requested) +(EngineCore pid=286) DEBUG 04-22 16:01:33 [v1/worker/gpu_worker.py:435] Memory profiling takes 22.38 seconds. Total non KV cache memory: 54.64GiB; torch peak memory increase: 3.66GiB; non-torch forward increase memory: 0.26GiB; weights memory: 50.72GiB. +(EngineCore pid=286) INFO 04-22 16:01:33 [v1/worker/gpu_worker.py:436] Available KV cache memory: 20.59 GiB +(EngineCore pid=286) INFO 04-22 16:01:33 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9643 to maintain the same effective KV cache size. +(EngineCore pid=286) INFO 04-22 16:01:33 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 58,672 tokens +(EngineCore pid=286) INFO 04-22 16:01:33 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 7.16x +(EngineCore pid=286) 2026-04-22 16:01:33,734 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=286) DEBUG 04-22 16:01:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=286) 2026-04-22 16:01:33,745 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=286) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 15:49:40 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 15:49:40 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 15:49:40 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:49:40 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 15:49:40 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 15:49:40 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-2-2b-it +(APIServer pid=1) INFO 04-22 15:49:40 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 15:49:40 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:49:40 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-2-2b-it', 'model': 'google/gemma-2-2b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 15:49:40 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 403, in hf_raise_for_status +(APIServer pid=1) response.raise_for_status() +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1028, in raise_for_status +(APIServer pid=1) raise HTTPError(http_error_msg, response=self) +(APIServer pid=1) requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/google/gemma-2-2b-it/resolve/main/config.json +(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1550, in _get_metadata_or_catch_error +(APIServer pid=1) metadata = get_hf_file_metadata( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn +(APIServer pid=1) 
return fn(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1467, in get_hf_file_metadata +(APIServer pid=1) r = _request_wrapper( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 283, in _request_wrapper +(APIServer pid=1) response = _request_wrapper( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 307, in _request_wrapper +(APIServer pid=1) hf_raise_for_status(response) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 467, in hf_raise_for_status +(APIServer pid=1) raise _format(HfHubHTTPError, message, response) from e +(APIServer pid=1) huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-69e8ee14-22d7cb9163602c5941ebd21d;57340d59-f182-4b7d-a919-4768efc8392f) +(APIServer pid=1) +(APIServer pid=1) 403 Forbidden: Please enable access to public gated repositories in your fine-grained token settings to view this repository.. +(APIServer pid=1) Cannot access content at: https://huggingface.co/google/gemma-2-2b-it/resolve/main/config.json. +(APIServer pid=1) Make sure your token has the correct permissions. 
+(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 479, in cached_files +(APIServer pid=1) hf_hub_download( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn +(APIServer pid=1) return fn(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1014, in hf_hub_download +(APIServer pid=1) return _hf_hub_download_to_cache_dir( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1121, in _hf_hub_download_to_cache_dir +(APIServer pid=1) _raise_on_head_call_error(head_call_error, force_download, local_files_only) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1665, in _raise_on_head_call_error +(APIServer pid=1) raise LocalEntryNotFoundError( +(APIServer pid=1) huggingface_hub.errors.LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on. 
+(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in +(APIServer pid=1) sys.exit(main()) +(APIServer pid=1) ^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main +(APIServer pid=1) args.dispatch_function(args) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd +(APIServer pid=1) uvloop.run(run_server(args)) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run +(APIServer pid=1) return __asyncio.run( +(APIServer pid=1) ^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run +(APIServer pid=1) return runner.run(main) +(APIServer pid=1) ^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run +(APIServer pid=1) return self._loop.run_until_complete(task) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper +(APIServer pid=1) return await main +(APIServer pid=1) ^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server +(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker +(APIServer pid=1) async with build_async_engine_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return 
await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client +(APIServer pid=1) async with build_async_engine_client_from_engine_args( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 124, in build_async_engine_client_from_engine_args +(APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1539, in create_engine_config +(APIServer pid=1) maybe_override_with_speculators( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/config.py", line 583, in maybe_override_with_speculators +(APIServer pid=1) config_dict, _ = PretrainedConfig.get_config_dict( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 662, in get_config_dict +(APIServer pid=1) config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 721, in _get_config_dict +(APIServer pid=1) resolved_config_file = cached_file( +(APIServer pid=1) ^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 322, in cached_file +(APIServer pid=1) file = 
cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 553, in cached_files +(APIServer pid=1) raise OSError( +(APIServer pid=1) OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files. +(APIServer pid=1) Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'. diff --git a/accuracy/results/v0.19.0/logs/google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.log new file mode 100644 index 00000000..4133bbe3 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.log @@ -0,0 +1,745 @@ +DEBUG 04-22 15:55:00 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 15:55:00 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 15:55:00 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 15:55:00 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:55:00 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:55:00 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 15:55:00 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 15:55:00 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 15:55:00 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 15:55:00 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-22 15:55:00 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:55:00 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 15:55:04 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 15:55:06 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 15:55:06 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 15:55:06 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 15:55:06 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 15:55:06 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 15:55:06 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:55:06 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 15:55:06 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 15:55:06 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-2-2b-it +(APIServer pid=1) INFO 04-22 15:55:06 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 15:55:06 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:55:06 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-2-2b-it', 'model': 'google/gemma-2-2b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 15:55:06 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 15:55:07 [model_executor/models/registry.py:774] Cached model info file for class 
vllm.model_executor.models.gemma2.Gemma2ForCausalLM not found +(APIServer pid=1) DEBUG 04-22 15:55:07 [model_executor/models/registry.py:834] Cache model info for class vllm.model_executor.models.gemma2.Gemma2ForCausalLM miss. Loading model instead. +(APIServer pid=1) DEBUG 04-22 15:55:16 [model_executor/models/registry.py:844] Loaded model info for class vllm.model_executor.models.gemma2.Gemma2ForCausalLM +(APIServer pid=1) DEBUG 04-22 15:55:16 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 9.7262713 secs +(APIServer pid=1) INFO 04-22 15:55:16 [config/model.py:549] Resolved architecture: Gemma2ForCausalLM +(APIServer pid=1) INFO 04-22 15:55:16 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 15:55:16 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 15:55:16 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 15:55:16 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 15:55:16 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 15:55:16 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 15:55:16 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 15:55:16 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 15:55:16 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. 
+(APIServer pid=1) DEBUG 04-22 15:55:16 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 15:55:19 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 15:55:19 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 15:55:22 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 15:55:22 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 15:55:22 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 15:55:22 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:55:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:55:22 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 15:55:22 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 15:55:22 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 15:55:22 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 15:55:22 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:55:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:55:22 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 15:55:27 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=474) DEBUG 04-22 15:55:29 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 15:55:29 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=474) DEBUG 04-22 15:55:29 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/7d536ea1-41ff-4939-a7ea-83235282434d'], outputs=['ipc:///tmp/13c3abbe-776f-4b4b-8481-36eeaa00e050'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=474) DEBUG 04-22 15:55:29 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=474) DEBUG 04-22 15:55:29 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=474) DEBUG 04-22 15:55:29 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=474) DEBUG 04-22 15:55:29 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=474) DEBUG 04-22 15:55:29 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=474) INFO 04-22 15:55:29 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google/gemma-2-2b-it', speculative_config=None, tokenizer='google/gemma-2-2b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google/gemma-2-2b-it, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=474) DEBUG 04-22 15:55:29 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.209:49295 backend=nccl +(EngineCore pid=474) INFO 04-22 15:55:29 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.209:49295 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=474) DEBUG 04-22 15:55:30 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=474) INFO 04-22 15:55:30 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=474) DEBUG 04-22 15:55:30 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776873330.4498482, auto_measure=True +(EngineCore pid=474) DEBUG 04-22 15:55:30 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=474) DEBUG 04-22 15:55:30 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=474) DEBUG 04-22 15:55:30 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=474) DEBUG 04-22 15:55:30 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=474) DEBUG 04-22 15:55:30 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=474) DEBUG 04-22 15:55:30 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=474) DEBUG 04-22 15:55:30 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=474) DEBUG 04-22 15:55:30 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=474) DEBUG 04-22 15:55:30 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=474) INFO 04-22 15:55:30 [v1/worker/gpu_model_runner.py:4735] Starting to load model google/gemma-2-2b-it... +(EngineCore pid=474) DEBUG 04-22 15:55:31 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=256, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=474) INFO 04-22 15:55:31 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=474) INFO 04-22 15:55:31 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=474) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=474) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=474) DEBUG 04-22 15:55:31 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=474) DEBUG 04-22 15:55:31 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=474) DEBUG 04-22 15:55:31 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 105, 'gelu_and_mul': 26, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'logits_processor': 1}) +(EngineCore pid=474) DEBUG 04-22 15:55:31 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=474) DEBUG 04-22 15:55:31 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00002.safetensors', 'model-00001-of-00002.safetensors']] +(APIServer pid=1) DEBUG 04-22 15:55:39 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=474) INFO 04-22 15:55:40 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for google/gemma-2-2b-it: 8.440384 seconds +(EngineCore pid=474) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gemma2.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=474) DEBUG 04-22 15:55:49 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=474) INFO 04-22 15:55:49 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/92cf196b29/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=9e1fbf4885 comp=e546579c48 code=1effc1a117077c34d7781a48a74d1e9a6c34076dbd4b7937695d68b6a3d23a2c dir=/data/.cache/vllm/torch_compile_cache/92cf196b29/rank_0_0/backbone +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, 
+(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=474) DEBUG 04-22 15:55:49 
[compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 
[compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', 
+(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, 
+(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, 
+(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=474) 
DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=474) DEBUG 04-22 15:55:49 
[compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore 
pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 
'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] Vllm config hash: 9e1fbf4885 +(EngineCore pid=474) INFO 04-22 15:55:49 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.85 s +(APIServer pid=1) DEBUG 04-22 15:55:49 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=474) DEBUG 04-22 15:55:50 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=474) DEBUG 04-22 15:55:50 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=474) DEBUG 04-22 15:55:50 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms +(EngineCore pid=474) DEBUG 04-22 15:55:50 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=474) DEBUG 04-22 15:55:50 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=474) INFO 04-22 15:55:51 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=474) DEBUG 04-22 15:55:51 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/92cf196b29/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=474) DEBUG 04-22 15:55:52 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=474) DEBUG 04-22 15:55:52 
[compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=474) DEBUG 04-22 15:55:52 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.4 ms +(EngineCore pid=474) DEBUG 04-22 15:55:52 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=474) DEBUG 04-22 15:55:52 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=474) DEBUG 04-22 15:55:54 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/92cf196b29/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=474) DEBUG 04-22 15:55:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=474) DEBUG 04-22 15:55:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.2 ms +(EngineCore pid=474) DEBUG 04-22 15:55:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.3 ms +(EngineCore pid=474) DEBUG 04-22 15:55:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=474) DEBUG 04-22 15:55:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(EngineCore pid=474) DEBUG 04-22 15:55:55 [compilation/backends.py:377] Store the 26-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_26', '/data/.cache/vllm/torch_compile_cache/92cf196b29/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_26') +(EngineCore pid=474) INFO 04-22 15:55:55 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.97 s +(EngineCore pid=474) DEBUG 04-22 15:55:55 [compilation/backends.py:1228] Computation graph 
saved to /data/.cache/vllm/torch_compile_cache/92cf196b29/rank_0_0/backbone/computation_graph.py +(EngineCore pid=474) INFO 04-22 15:55:56 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/1d940999550d016d5904ae05d31d0ec822db5c3ea1c63a9eedf126a4d6b58248/rank_0_0/model +(EngineCore pid=474) INFO 04-22 15:55:56 [compilation/monitor.py:48] torch.compile took 11.46 s in total +(EngineCore pid=474) INFO 04-22 15:55:57 [compilation/monitor.py:76] Initial profiling/warmup run took 0.28 s +(APIServer pid=1) DEBUG 04-22 15:55:59 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=474) INFO 04-22 15:56:02 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=474) DEBUG 04-22 15:56:02 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=474) INFO 04-22 15:56:02 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=474) DEBUG 04-22 15:56:03 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=474) DEBUG 04-22 15:56:03 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, 
uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 118.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=474) DEBUG 04-22 15:56:03 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=474) DEBUG 04-22 15:56:03 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 134.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=474) INFO 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.52 GiB total +(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_worker.py:430] Free memory after profiling: 72.29 GiB (total), 68.84 GiB (within requested) +(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_worker.py:435] Memory profiling takes 18.59 seconds. 
Total non KV cache memory: 8.76GiB; torch peak memory increase: 3.62GiB; non-torch forward increase memory: 0.24GiB; weights memory: 4.9GiB. +(EngineCore pid=474) INFO 04-22 15:56:03 [v1/worker/gpu_worker.py:436] Available KV cache memory: 66.47 GiB +(EngineCore pid=474) INFO 04-22 15:56:03 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9566 to maintain the same effective KV cache size. +(EngineCore pid=474) INFO 04-22 15:56:03 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 670,160 tokens +(EngineCore pid=474) INFO 04-22 15:56:03 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 81.73x +(EngineCore pid=474) 2026-04-22 15:56:03,910 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=474) 2026-04-22 15:56:03,916 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=474) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 15:49:53 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 15:49:53 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 15:49:53 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:49:53 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 15:49:53 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 15:49:53 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-2-9b-it +(APIServer pid=1) INFO 04-22 15:49:53 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 15:49:53 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:49:53 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-2-9b-it', 'model': 'google/gemma-2-9b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 15:49:53 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 403, in hf_raise_for_status +(APIServer pid=1) response.raise_for_status() +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1028, in raise_for_status +(APIServer pid=1) raise HTTPError(http_error_msg, response=self) +(APIServer pid=1) requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/google/gemma-2-9b-it/resolve/main/config.json +(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1550, in _get_metadata_or_catch_error +(APIServer pid=1) metadata = get_hf_file_metadata( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn +(APIServer pid=1) 
return fn(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1467, in get_hf_file_metadata +(APIServer pid=1) r = _request_wrapper( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 283, in _request_wrapper +(APIServer pid=1) response = _request_wrapper( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 307, in _request_wrapper +(APIServer pid=1) hf_raise_for_status(response) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 467, in hf_raise_for_status +(APIServer pid=1) raise _format(HfHubHTTPError, message, response) from e +(APIServer pid=1) huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-69e8ee21-227b87a308f241507904e4ee;d5a6ca02-00be-4c30-9f6f-bc2dc9bc5644) +(APIServer pid=1) +(APIServer pid=1) 403 Forbidden: Please enable access to public gated repositories in your fine-grained token settings to view this repository.. +(APIServer pid=1) Cannot access content at: https://huggingface.co/google/gemma-2-9b-it/resolve/main/config.json. +(APIServer pid=1) Make sure your token has the correct permissions. 
+(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 479, in cached_files +(APIServer pid=1) hf_hub_download( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn +(APIServer pid=1) return fn(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1014, in hf_hub_download +(APIServer pid=1) return _hf_hub_download_to_cache_dir( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1121, in _hf_hub_download_to_cache_dir +(APIServer pid=1) _raise_on_head_call_error(head_call_error, force_download, local_files_only) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1665, in _raise_on_head_call_error +(APIServer pid=1) raise LocalEntryNotFoundError( +(APIServer pid=1) huggingface_hub.errors.LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on. 
+(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in +(APIServer pid=1) sys.exit(main()) +(APIServer pid=1) ^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main +(APIServer pid=1) args.dispatch_function(args) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd +(APIServer pid=1) uvloop.run(run_server(args)) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run +(APIServer pid=1) return __asyncio.run( +(APIServer pid=1) ^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run +(APIServer pid=1) return runner.run(main) +(APIServer pid=1) ^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run +(APIServer pid=1) return self._loop.run_until_complete(task) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper +(APIServer pid=1) return await main +(APIServer pid=1) ^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server +(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker +(APIServer pid=1) async with build_async_engine_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return 
await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client +(APIServer pid=1) async with build_async_engine_client_from_engine_args( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 124, in build_async_engine_client_from_engine_args +(APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1539, in create_engine_config +(APIServer pid=1) maybe_override_with_speculators( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/config.py", line 583, in maybe_override_with_speculators +(APIServer pid=1) config_dict, _ = PretrainedConfig.get_config_dict( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 662, in get_config_dict +(APIServer pid=1) config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 721, in _get_config_dict +(APIServer pid=1) resolved_config_file = cached_file( +(APIServer pid=1) ^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 322, in cached_file +(APIServer pid=1) file = 
cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 553, in cached_files +(APIServer pid=1) raise OSError( +(APIServer pid=1) OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files. +(APIServer pid=1) Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'. diff --git a/accuracy/results/v0.19.0/logs/google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.log new file mode 100644 index 00000000..98d9fb14 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.log @@ -0,0 +1,749 @@ +DEBUG 04-22 15:56:20 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 15:56:20 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 15:56:20 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 15:56:20 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:56:20 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:56:20 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 15:56:20 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 15:56:20 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 15:56:20 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 15:56:20 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-22 15:56:20 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:56:20 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 15:56:25 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 15:56:27 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 15:56:27 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 15:56:27 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 15:56:27 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 15:56:27 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 15:56:27 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:56:27 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 15:56:27 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 15:56:27 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-2-9b-it +(APIServer pid=1) INFO 04-22 15:56:27 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 15:56:27 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:56:27 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-2-9b-it', 'model': 'google/gemma-2-9b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 15:56:27 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 15:56:27 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.gemma2.Gemma2ForCausalLM 
from cache +(APIServer pid=1) DEBUG 04-22 15:56:27 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003558 secs +(APIServer pid=1) INFO 04-22 15:56:27 [config/model.py:549] Resolved architecture: Gemma2ForCausalLM +(APIServer pid=1) INFO 04-22 15:56:27 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 15:56:27 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 15:56:27 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 15:56:27 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 15:56:27 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 15:56:27 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 15:56:27 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 15:56:27 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 15:56:27 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 15:56:27 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 15:56:29 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 15:56:29 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 15:56:33 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 15:56:33 [platforms/__init__.py:37] Checking if TPU platform is available. 
+DEBUG 04-22 15:56:33 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 15:56:33 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:56:33 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:56:33 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 15:56:33 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 15:56:33 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 15:56:33 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 15:56:33 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 15:56:33 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 15:56:33 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 15:56:38 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=285) DEBUG 04-22 15:56:39 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 15:56:39 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=285) DEBUG 04-22 15:56:39 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/b04747ea-0d9b-4301-a190-f022a3dff208'], outputs=['ipc:///tmp/a862b31a-8f2a-4a45-9ec5-5716564fee40'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=285) DEBUG 04-22 15:56:39 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=285) DEBUG 04-22 15:56:39 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=285) DEBUG 04-22 15:56:39 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=285) DEBUG 04-22 15:56:39 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=285) DEBUG 04-22 15:56:39 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=285) INFO 04-22 15:56:39 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google/gemma-2-9b-it', speculative_config=None, tokenizer='google/gemma-2-9b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google/gemma-2-9b-it, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=285) DEBUG 04-22 15:56:40 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.211:46303 backend=nccl +(EngineCore pid=285) INFO 04-22 15:56:40 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.211:46303 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=285) DEBUG 04-22 15:56:40 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=285) INFO 04-22 15:56:40 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=285) DEBUG 04-22 15:56:40 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776873400.8206272, auto_measure=True +(EngineCore pid=285) DEBUG 04-22 15:56:40 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=285) DEBUG 04-22 15:56:40 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=285) DEBUG 04-22 15:56:40 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=285) DEBUG 04-22 15:56:40 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=285) DEBUG 04-22 15:56:40 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=285) DEBUG 04-22 15:56:40 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=285) DEBUG 04-22 15:56:41 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=285) DEBUG 04-22 15:56:41 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=285) DEBUG 04-22 15:56:41 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=285) INFO 04-22 15:56:41 [v1/worker/gpu_model_runner.py:4735] Starting to load model google/gemma-2-9b-it... +(EngineCore pid=285) DEBUG 04-22 15:56:41 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=256, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=285) INFO 04-22 15:56:41 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=285) INFO 04-22 15:56:41 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=285) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=285) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=285) DEBUG 04-22 15:56:41 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=285) DEBUG 04-22 15:56:41 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=285) DEBUG 04-22 15:56:41 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 169, 'gelu_and_mul': 42, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'logits_processor': 1}) +(EngineCore pid=285) DEBUG 04-22 15:56:41 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=285) DEBUG 04-22 15:56:42 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors']] +(APIServer pid=1) DEBUG 04-22 15:56:49 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 15:56:59 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=285) INFO 04-22 15:57:07 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for google/gemma-2-9b-it: 25.415698 seconds +(EngineCore pid=285) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(APIServer pid=1) DEBUG 04-22 15:57:29 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gemma2.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=285) INFO 04-22 15:57:30 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/938dccb18b/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=2fa46dba6a comp=e546579c48 code=1effc1a117077c34d7781a48a74d1e9a6c34076dbd4b7937695d68b6a3d23a2c dir=/data/.cache/vllm/torch_compile_cache/938dccb18b/rank_0_0/backbone +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, 
+(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 
[compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 
[compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 
[compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', 
+(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 
1, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=285) DEBUG 04-22 15:57:30 
[compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 
[compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, 
+(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] Vllm config hash: 2fa46dba6a +(EngineCore pid=285) INFO 04-22 15:57:30 [compilation/backends.py:1111] Dynamo bytecode transform time: 5.96 s +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=285) DEBUG 04-22 15:57:31 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=285) DEBUG 04-22 15:57:31 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=285) DEBUG 04-22 15:57:31 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=285) DEBUG 04-22 15:57:31 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=285) DEBUG 04-22 15:57:31 [compilation/passes/vllm_inductor_pass.py:79] 
FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=285) INFO 04-22 15:57:32 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=285) DEBUG 04-22 15:57:32 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/938dccb18b/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=285) DEBUG 04-22 15:57:33 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=285) DEBUG 04-22 15:57:33 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=285) DEBUG 04-22 15:57:33 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.4 ms +(EngineCore pid=285) DEBUG 04-22 15:57:33 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=285) DEBUG 04-22 15:57:33 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=285) DEBUG 04-22 15:57:35 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/938dccb18b/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=285) DEBUG 04-22 15:57:36 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=285) DEBUG 04-22 15:57:36 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.2 ms +(EngineCore pid=285) DEBUG 04-22 15:57:36 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.6 ms +(EngineCore pid=285) DEBUG 04-22 15:57:36 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore 
pid=285) DEBUG 04-22 15:57:36 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(EngineCore pid=285) DEBUG 04-22 15:57:36 [compilation/backends.py:377] Store the 42-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_42', '/data/.cache/vllm/torch_compile_cache/938dccb18b/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_42') +(EngineCore pid=285) INFO 04-22 15:57:36 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.31 s +(EngineCore pid=285) DEBUG 04-22 15:57:36 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/938dccb18b/rank_0_0/backbone/computation_graph.py +(EngineCore pid=285) INFO 04-22 15:57:38 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/e599b74f8707bd2734d2d72d08671f09975bf502f65698747fb9e3a189bfc9db/rank_0_0/model +(EngineCore pid=285) INFO 04-22 15:57:38 [compilation/monitor.py:48] torch.compile took 14.31 s in total +(EngineCore pid=285) INFO 04-22 15:57:38 [compilation/monitor.py:76] Initial profiling/warmup run took 0.25 s +(APIServer pid=1) DEBUG 04-22 15:57:39 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=285) INFO 04-22 15:57:44 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=285) INFO 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=285) DEBUG 04-22 15:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=285) DEBUG 04-22 15:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 146.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=285) DEBUG 04-22 15:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
+(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=285) DEBUG 04-22 15:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=285) DEBUG 04-22 15:57:45 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=285) INFO 04-22 15:57:45 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total +(EngineCore pid=285) DEBUG 04-22 15:57:45 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=285) DEBUG 04-22 15:57:45 [v1/worker/gpu_worker.py:430] Free memory after profiling: 59.37 GiB (total), 55.92 GiB (within requested) +(EngineCore pid=285) DEBUG 04-22 15:57:45 [v1/worker/gpu_worker.py:435] Memory profiling takes 21.51 seconds. Total non KV cache memory: 21.11GiB; torch peak memory increase: 3.65GiB; non-torch forward increase memory: 0.25GiB; weights memory: 17.22GiB. +(EngineCore pid=285) INFO 04-22 15:57:45 [v1/worker/gpu_worker.py:436] Available KV cache memory: 54.12 GiB +(EngineCore pid=285) INFO 04-22 15:57:45 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. 
+(EngineCore pid=285) INFO 04-22 15:57:45 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 168,880 tokens +(EngineCore pid=285) INFO 04-22 15:57:45 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 20.60x +(EngineCore pid=285) 2026-04-22 15:57:45,711 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=285) DEBUG 04-22 15:57:45 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=285) 2026-04-22 15:57:45,720 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=285) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 15:50:32 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 15:50:32 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 15:50:32 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:50:32 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 15:50:32 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 15:50:32 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-3-12b-it +(APIServer pid=1) INFO 04-22 15:50:32 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 15:50:32 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:50:32 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-3-12b-it', 'model': 'google/gemma-3-12b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 15:50:32 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 403, in hf_raise_for_status +(APIServer pid=1) response.raise_for_status() +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1028, in raise_for_status +(APIServer pid=1) raise HTTPError(http_error_msg, response=self) +(APIServer pid=1) requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/google/gemma-3-12b-it/resolve/main/config.json +(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1550, in _get_metadata_or_catch_error +(APIServer pid=1) metadata = get_hf_file_metadata( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn +(APIServer 
pid=1) return fn(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1467, in get_hf_file_metadata +(APIServer pid=1) r = _request_wrapper( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 283, in _request_wrapper +(APIServer pid=1) response = _request_wrapper( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 307, in _request_wrapper +(APIServer pid=1) hf_raise_for_status(response) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 467, in hf_raise_for_status +(APIServer pid=1) raise _format(HfHubHTTPError, message, response) from e +(APIServer pid=1) huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-69e8ee48-558ebd955f7a279229ca7b78;f864c0bf-514c-4724-85b6-fb6b29a05337) +(APIServer pid=1) +(APIServer pid=1) 403 Forbidden: Please enable access to public gated repositories in your fine-grained token settings to view this repository.. +(APIServer pid=1) Cannot access content at: https://huggingface.co/google/gemma-3-12b-it/resolve/main/config.json. +(APIServer pid=1) Make sure your token has the correct permissions. 
+(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 479, in cached_files +(APIServer pid=1) hf_hub_download( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn +(APIServer pid=1) return fn(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1014, in hf_hub_download +(APIServer pid=1) return _hf_hub_download_to_cache_dir( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1121, in _hf_hub_download_to_cache_dir +(APIServer pid=1) _raise_on_head_call_error(head_call_error, force_download, local_files_only) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1665, in _raise_on_head_call_error +(APIServer pid=1) raise LocalEntryNotFoundError( +(APIServer pid=1) huggingface_hub.errors.LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on. 
+(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in +(APIServer pid=1) sys.exit(main()) +(APIServer pid=1) ^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main +(APIServer pid=1) args.dispatch_function(args) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd +(APIServer pid=1) uvloop.run(run_server(args)) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run +(APIServer pid=1) return __asyncio.run( +(APIServer pid=1) ^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run +(APIServer pid=1) return runner.run(main) +(APIServer pid=1) ^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run +(APIServer pid=1) return self._loop.run_until_complete(task) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper +(APIServer pid=1) return await main +(APIServer pid=1) ^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server +(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker +(APIServer pid=1) async with build_async_engine_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return 
await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client +(APIServer pid=1) async with build_async_engine_client_from_engine_args( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 124, in build_async_engine_client_from_engine_args +(APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1539, in create_engine_config +(APIServer pid=1) maybe_override_with_speculators( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/config.py", line 583, in maybe_override_with_speculators +(APIServer pid=1) config_dict, _ = PretrainedConfig.get_config_dict( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 662, in get_config_dict +(APIServer pid=1) config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 721, in _get_config_dict +(APIServer pid=1) resolved_config_file = cached_file( +(APIServer pid=1) ^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 322, in cached_file +(APIServer pid=1) file = 
cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 553, in cached_files +(APIServer pid=1) raise OSError( +(APIServer pid=1) OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files. +(APIServer pid=1) Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'. diff --git a/accuracy/results/v0.19.0/logs/google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.log new file mode 100644 index 00000000..354a2364 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.log @@ -0,0 +1,769 @@ +DEBUG 04-22 16:04:02 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 16:04:02 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 16:04:02 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 16:04:02 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 16:04:02 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 16:04:02 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 16:04:02 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 16:04:02 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 16:04:02 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 16:04:02 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-22 16:04:02 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 16:04:02 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 16:04:07 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 16:04:09 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 16:04:09 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 16:04:09 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 16:04:09 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 16:04:09 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 16:04:09 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 16:04:09 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 16:04:09 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 16:04:09 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-3-12b-it +(APIServer pid=1) INFO 04-22 16:04:09 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 16:04:09 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 16:04:09 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-3-12b-it', 'model': 'google/gemma-3-12b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 16:04:09 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 16:04:09 [model_executor/models/registry.py:827] Loaded model info for class 
vllm.model_executor.models.gemma3_mm.Gemma3ForConditionalGeneration from cache +(APIServer pid=1) DEBUG 04-22 16:04:09 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0004434 secs +(APIServer pid=1) INFO 04-22 16:04:09 [config/model.py:549] Resolved architecture: Gemma3ForConditionalGeneration +(APIServer pid=1) INFO 04-22 16:04:09 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 16:04:09 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 16:04:09 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 16:04:09 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 16:04:09 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 16:04:09 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 16:04:09 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 16:04:09 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) WARNING 04-22 16:04:09 [platforms/cuda.py:199] Forcing --disable_chunked_mm_input for models with multimodal-bidirectional attention. +(APIServer pid=1) DEBUG 04-22 16:04:09 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 16:04:09 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 16:04:12 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 16:04:13 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. 
+(APIServer pid=1) DEBUG 04-22 16:04:14 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +(APIServer pid=1) Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +DEBUG 04-22 16:04:26 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 16:04:26 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 16:04:26 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 16:04:26 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 16:04:26 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 16:04:26 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 16:04:26 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 16:04:26 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 16:04:26 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 16:04:26 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 16:04:26 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 16:04:26 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 16:04:31 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(APIServer pid=1) DEBUG 04-22 16:04:32 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. +(EngineCore pid=510) DEBUG 04-22 16:04:33 [v1/engine/core.py:1018] Waiting for init message from front-end. 
+(APIServer pid=1) DEBUG 04-22 16:04:33 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=510) DEBUG 04-22 16:04:33 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/908ab5ba-95d5-4a37-881e-6edc152f135d'], outputs=['ipc:///tmp/36f6cabd-2ae1-4604-93cc-5accfd67e0cd'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=510) DEBUG 04-22 16:04:33 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=510) DEBUG 04-22 16:04:33 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=510) DEBUG 04-22 16:04:33 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=510) DEBUG 04-22 16:04:33 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=510) DEBUG 04-22 16:04:33 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=510) INFO 04-22 16:04:33 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google/gemma-3-12b-it', speculative_config=None, tokenizer='google/gemma-3-12b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google/gemma-3-12b-it, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=510) DEBUG 04-22 16:04:33 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(EngineCore pid=510) DEBUG 04-22 16:04:35 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.219:60697 backend=nccl +(EngineCore pid=510) INFO 04-22 16:04:35 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.219:60697 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(EngineCore pid=510) DEBUG 04-22 16:04:35 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=510) INFO 04-22 16:04:35 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=510) DEBUG 04-22 16:04:35 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776873875.928553, auto_measure=True +(EngineCore pid=510) DEBUG 04-22 16:04:35 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=510) DEBUG 04-22 16:04:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=510) DEBUG 04-22 16:04:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=510) DEBUG 04-22 16:04:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=510) DEBUG 04-22 16:04:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=510) 
DEBUG 04-22 16:04:36 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(EngineCore pid=510) DEBUG 04-22 16:04:36 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=510) DEBUG 04-22 16:04:36 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. +(EngineCore pid=510) Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +(APIServer pid=1) DEBUG 04-22 16:04:43 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=510) DEBUG 04-22 16:04:44 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=510) INFO 04-22 16:04:44 [v1/worker/gpu_model_runner.py:4735] Starting to load model google/gemma-3-12b-it... +(EngineCore pid=510) INFO 04-22 16:04:45 [model_executor/models/interfaces.py:171] Contains out of vocabulary multimodal tokens? False +(EngineCore pid=510) INFO 04-22 16:04:45 [platforms/cuda.py:390] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention +(EngineCore pid=510) INFO 04-22 16:04:45 [model_executor/.../attention/mm_encoder_attention.py:230] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention. +(EngineCore pid=510) INFO 04-22 16:04:45 [config/vllm.py:790] Asynchronous scheduling is enabled. 
+(EngineCore pid=510) DEBUG 04-22 16:04:45 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=510) DEBUG 04-22 16:04:45 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=256, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=True, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASH_ATTN: [partial multimodal token full attention not supported], FLASHINFER: [partial multimodal token full attention not supported]}. +(EngineCore pid=510) INFO 04-22 16:04:45 [platforms/cuda.py:334] Using TRITON_ATTN attention backend out of potential backends: ['TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=510) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=510) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=510) DEBUG 04-22 16:04:45 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=510) DEBUG 04-22 16:04:45 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) +(EngineCore pid=510) DEBUG 04-22 16:04:45 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 290, 'gelu_and_mul': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=510) DEBUG 04-22 16:04:45 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) +(EngineCore pid=510) DEBUG 04-22 16:04:45 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 290, 'gelu_and_mul': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=510) DEBUG 04-22 16:04:45 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=510) DEBUG 04-22 16:04:45 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00003-of-00005.safetensors', 'model-00005-of-00005.safetensors', 'model-00001-of-00005.safetensors', 'model-00002-of-00005.safetensors', 'model-00004-of-00005.safetensors']] +(APIServer pid=1) DEBUG 04-22 16:04:53 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 16:05:03 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 16:05:13 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 16:05:23 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=510) INFO 04-22 16:05:24 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for google/gemma-3-12b-it: 38.575716 seconds +(EngineCore pid=510) Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00 +(APIServer pid=1) DEBUG 04-22 16:05:53 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gemma3.py +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=510) INFO 04-22 16:05:55 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/f76d04716e/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=af89a8e94b comp=e546579c48 code=616836617014c5f9fc7251fa87e8f62cc5716448a3f7628492691f84ac2574d9 dir=/data/.cache/vllm/torch_compile_cache/f76d04716e/rank_0_0/backbone +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 
'NVCC_THREADS': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=510) DEBUG 04-22 
16:05:55 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=510) DEBUG 04-22 16:05:55 
[compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=510) DEBUG 04-22 
16:05:55 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 
'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 
'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', 
+(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=510) DEBUG 04-22 16:05:55 
[compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, 
+(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] Vllm config hash: af89a8e94b +(EngineCore pid=510) INFO 04-22 16:05:55 [compilation/backends.py:1111] Dynamo bytecode transform time: 7.88 s +(EngineCore pid=510) DEBUG 04-22 16:05:56 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=510) DEBUG 04-22 16:05:56 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=510) DEBUG 04-22 16:05:57 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=510) DEBUG 04-22 16:05:57 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=510) DEBUG 04-22 16:05:57 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms +(EngineCore pid=510) DEBUG 04-22 16:05:57 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes 
+(EngineCore pid=510) DEBUG 04-22 16:05:57 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=510) INFO 04-22 16:06:01 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=510) DEBUG 04-22 16:06:01 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/f76d04716e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=510) DEBUG 04-22 16:06:01 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=510) DEBUG 04-22 16:06:01 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=510) DEBUG 04-22 16:06:01 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.7 ms +(EngineCore pid=510) DEBUG 04-22 16:06:01 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=510) DEBUG 04-22 16:06:01 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(APIServer pid=1) DEBUG 04-22 16:06:03 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=510) DEBUG 04-22 16:06:04 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/f76d04716e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=510) DEBUG 04-22 16:06:04 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/f76d04716e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_2') +(EngineCore pid=510) DEBUG 04-22 16:06:05 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=510) DEBUG 04-22 16:06:05 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=510) DEBUG 04-22 16:06:05 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.7 ms +(EngineCore pid=510) DEBUG 04-22 16:06:05 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=510) DEBUG 04-22 16:06:05 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=510) DEBUG 04-22 16:06:07 [compilation/backends.py:377] Store the 5-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_5', '/data/.cache/vllm/torch_compile_cache/f76d04716e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_5') +(EngineCore pid=510) DEBUG 04-22 16:06:08 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=510) DEBUG 04-22 16:06:08 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.3 ms +(EngineCore pid=510) DEBUG 04-22 16:06:08 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.5 ms +(EngineCore 
pid=510) DEBUG 04-22 16:06:08 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=510) DEBUG 04-22 16:06:08 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(EngineCore pid=510) DEBUG 04-22 16:06:08 [compilation/backends.py:377] Store the 48-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_48', '/data/.cache/vllm/torch_compile_cache/f76d04716e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_48') +(EngineCore pid=510) INFO 04-22 16:06:08 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 12.51 s +(EngineCore pid=510) DEBUG 04-22 16:06:08 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/f76d04716e/rank_0_0/backbone/computation_graph.py +(EngineCore pid=510) INFO 04-22 16:06:11 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/172a779f211974e33e1f8d0b4be257e092b93b266784f6a1ef47398d3ac31e46/rank_0_0/model +(EngineCore pid=510) INFO 04-22 16:06:11 [compilation/monitor.py:48] torch.compile took 23.21 s in total +(EngineCore pid=510) INFO 04-22 16:06:11 [compilation/monitor.py:76] Initial profiling/warmup run took 0.41 s +(APIServer pid=1) DEBUG 04-22 16:06:13 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=510) INFO 04-22 16:06:17 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=510) DEBUG 04-22 16:06:17 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=510) INFO 04-22 16:06:17 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=510) DEBUG 04-22 16:06:18 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=510) DEBUG 04-22 16:06:18 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=510) DEBUG 04-22 16:06:18 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=510) DEBUG 04-22 16:06:18 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=510) DEBUG 04-22 16:06:18 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=510) DEBUG 04-22 16:06:18 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=510) DEBUG 04-22 16:06:18 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 146.00 MiB first-capture + (51-1) × 10.00 MiB per-graph +(EngineCore pid=510) DEBUG 04-22 16:06:18 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=510) DEBUG 04-22 16:06:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=510) DEBUG 04-22 16:06:19 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
+(EngineCore pid=510) DEBUG 04-22 16:06:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=510) DEBUG 04-22 16:06:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=510) DEBUG 04-22 16:06:19 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=510) DEBUG 04-22 16:06:19 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 4.00 MiB first-capture + (51-1) × 8.00 MiB per-graph +(EngineCore pid=510) DEBUG 04-22 16:06:20 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=510) INFO 04-22 16:06:20 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.02 GiB total +(EngineCore pid=510) DEBUG 04-22 16:06:20 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=510) DEBUG 04-22 16:06:20 [v1/worker/gpu_worker.py:430] Free memory after profiling: 54.52 GiB (total), 51.07 GiB (within requested) +(EngineCore pid=510) DEBUG 04-22 16:06:20 [v1/worker/gpu_worker.py:435] Memory profiling takes 33.32 seconds. Total non KV cache memory: 27.51GiB; torch peak memory increase: 3.94GiB; non-torch forward increase memory: 0.25GiB; weights memory: 23.31GiB. +(EngineCore pid=510) INFO 04-22 16:06:20 [v1/worker/gpu_worker.py:436] Available KV cache memory: 47.72 GiB +(EngineCore pid=510) INFO 04-22 16:06:20 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9629 to maintain the same effective KV cache size. 
+(EngineCore pid=510) INFO 04-22 16:06:20 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 130,304 tokens +(EngineCore pid=510) INFO 04-22 16:06:20 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 15.88x +(EngineCore pid=510) 2026-04-22 16:06:20,756 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=510) DEBUG 04-22 16:06:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=510) 2026-04-22 16:06:20,771 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=510) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 15:50:45 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 15:50:45 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 15:50:45 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:50:45 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 15:50:45 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 15:50:45 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-3-27b-it +(APIServer pid=1) INFO 04-22 15:50:45 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 15:50:45 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:50:45 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-3-27b-it', 'model': 'google/gemma-3-27b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 15:50:45 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 403, in hf_raise_for_status +(APIServer pid=1) response.raise_for_status() +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1028, in raise_for_status +(APIServer pid=1) raise HTTPError(http_error_msg, response=self) +(APIServer pid=1) requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/google/gemma-3-27b-it/resolve/main/config.json +(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1550, in _get_metadata_or_catch_error +(APIServer pid=1) metadata = get_hf_file_metadata( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn +(APIServer 
pid=1) return fn(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1467, in get_hf_file_metadata +(APIServer pid=1) r = _request_wrapper( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 283, in _request_wrapper +(APIServer pid=1) response = _request_wrapper( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 307, in _request_wrapper +(APIServer pid=1) hf_raise_for_status(response) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 467, in hf_raise_for_status +(APIServer pid=1) raise _format(HfHubHTTPError, message, response) from e +(APIServer pid=1) huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-69e8ee56-748b87a6382fbdc8082d9120;9cf089a8-e061-415a-b1d0-5ee33e12c23e) +(APIServer pid=1) +(APIServer pid=1) 403 Forbidden: Please enable access to public gated repositories in your fine-grained token settings to view this repository.. +(APIServer pid=1) Cannot access content at: https://huggingface.co/google/gemma-3-27b-it/resolve/main/config.json. +(APIServer pid=1) Make sure your token has the correct permissions. 
+(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 479, in cached_files +(APIServer pid=1) hf_hub_download( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn +(APIServer pid=1) return fn(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1014, in hf_hub_download +(APIServer pid=1) return _hf_hub_download_to_cache_dir( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1121, in _hf_hub_download_to_cache_dir +(APIServer pid=1) _raise_on_head_call_error(head_call_error, force_download, local_files_only) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1665, in _raise_on_head_call_error +(APIServer pid=1) raise LocalEntryNotFoundError( +(APIServer pid=1) huggingface_hub.errors.LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on. 
+(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in +(APIServer pid=1) sys.exit(main()) +(APIServer pid=1) ^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main +(APIServer pid=1) args.dispatch_function(args) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd +(APIServer pid=1) uvloop.run(run_server(args)) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run +(APIServer pid=1) return __asyncio.run( +(APIServer pid=1) ^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run +(APIServer pid=1) return runner.run(main) +(APIServer pid=1) ^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run +(APIServer pid=1) return self._loop.run_until_complete(task) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper +(APIServer pid=1) return await main +(APIServer pid=1) ^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server +(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker +(APIServer pid=1) async with build_async_engine_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return 
await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client +(APIServer pid=1) async with build_async_engine_client_from_engine_args( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 124, in build_async_engine_client_from_engine_args +(APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1539, in create_engine_config +(APIServer pid=1) maybe_override_with_speculators( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/config.py", line 583, in maybe_override_with_speculators +(APIServer pid=1) config_dict, _ = PretrainedConfig.get_config_dict( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 662, in get_config_dict +(APIServer pid=1) config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 721, in _get_config_dict +(APIServer pid=1) resolved_config_file = cached_file( +(APIServer pid=1) ^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 322, in cached_file +(APIServer pid=1) file = 
cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 553, in cached_files +(APIServer pid=1) raise OSError( +(APIServer pid=1) OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files. +(APIServer pid=1) Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'. diff --git a/accuracy/results/v0.19.0/logs/google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.log new file mode 100644 index 00000000..df501599 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.log @@ -0,0 +1,783 @@ +DEBUG 04-22 16:06:52 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 16:06:52 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 16:06:52 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 16:06:52 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 16:06:52 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 16:06:52 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 16:06:52 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 16:06:52 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 16:06:52 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 16:06:52 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-22 16:06:52 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 16:06:52 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 16:06:57 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 16:06:59 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 16:06:59 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 16:06:59 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 16:06:59 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 16:06:59 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 16:06:59 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 16:06:59 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 16:06:59 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 16:06:59 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-3-27b-it +(APIServer pid=1) INFO 04-22 16:06:59 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 16:06:59 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 16:06:59 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-3-27b-it', 'model': 'google/gemma-3-27b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 16:06:59 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 16:06:59 [model_executor/models/registry.py:827] Loaded model info for class 
vllm.model_executor.models.gemma3_mm.Gemma3ForConditionalGeneration from cache +(APIServer pid=1) DEBUG 04-22 16:06:59 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003820 secs +(APIServer pid=1) INFO 04-22 16:06:59 [config/model.py:549] Resolved architecture: Gemma3ForConditionalGeneration +(APIServer pid=1) INFO 04-22 16:06:59 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 16:06:59 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 16:06:59 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 16:06:59 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 16:06:59 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 16:06:59 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 16:06:59 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 16:06:59 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) WARNING 04-22 16:06:59 [platforms/cuda.py:199] Forcing --disable_chunked_mm_input for models with multimodal-bidirectional attention. +(APIServer pid=1) DEBUG 04-22 16:06:59 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 16:06:59 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 16:07:02 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 16:07:03 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. 
+(APIServer pid=1) DEBUG 04-22 16:07:03 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +(APIServer pid=1) Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +DEBUG 04-22 16:07:15 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 16:07:15 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 16:07:15 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 16:07:15 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 16:07:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 16:07:15 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 16:07:15 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 16:07:15 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 16:07:15 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 16:07:15 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 16:07:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 16:07:15 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 16:07:20 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=509) DEBUG 04-22 16:07:22 [v1/engine/core.py:1018] Waiting for init message from front-end. 
+(EngineCore pid=509) DEBUG 04-22 16:07:22 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/1376714c-14a9-478f-94c9-ec390c1e5ba5'], outputs=['ipc:///tmp/528325d5-7c95-4fb2-89cd-d7206a1f3d4b'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(APIServer pid=1) DEBUG 04-22 16:07:22 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=509) DEBUG 04-22 16:07:22 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=509) DEBUG 04-22 16:07:22 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=509) DEBUG 04-22 16:07:22 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=509) DEBUG 04-22 16:07:22 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=509) DEBUG 04-22 16:07:22 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=509) INFO 04-22 16:07:22 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google/gemma-3-27b-it', speculative_config=None, tokenizer='google/gemma-3-27b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google/gemma-3-27b-it, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=509) DEBUG 04-22 16:07:22 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(EngineCore pid=509) DEBUG 04-22 16:07:24 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.228:44139 backend=nccl +(EngineCore pid=509) INFO 04-22 16:07:24 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.228:44139 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(EngineCore pid=509) DEBUG 04-22 16:07:24 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=509) INFO 04-22 16:07:24 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=509) DEBUG 04-22 16:07:24 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776874044.4904332, auto_measure=True +(EngineCore pid=509) DEBUG 04-22 16:07:24 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=509) DEBUG 04-22 16:07:24 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=509) DEBUG 04-22 16:07:24 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=509) DEBUG 04-22 16:07:24 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=509) DEBUG 04-22 16:07:24 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=509) 
DEBUG 04-22 16:07:24 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(EngineCore pid=509) DEBUG 04-22 16:07:24 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=509) DEBUG 04-22 16:07:24 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. +(EngineCore pid=509) Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +(APIServer pid=1) DEBUG 04-22 16:07:32 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=509) DEBUG 04-22 16:07:32 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=509) INFO 04-22 16:07:32 [v1/worker/gpu_model_runner.py:4735] Starting to load model google/gemma-3-27b-it... +(EngineCore pid=509) INFO 04-22 16:07:32 [model_executor/models/interfaces.py:171] Contains out of vocabulary multimodal tokens? False +(EngineCore pid=509) INFO 04-22 16:07:32 [platforms/cuda.py:390] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention +(EngineCore pid=509) INFO 04-22 16:07:32 [model_executor/.../attention/mm_encoder_attention.py:230] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention. +(EngineCore pid=509) INFO 04-22 16:07:32 [config/vllm.py:790] Asynchronous scheduling is enabled. 
+(EngineCore pid=509) DEBUG 04-22 16:07:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=509) DEBUG 04-22 16:07:33 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=True, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASH_ATTN: [partial multimodal token full attention not supported], FLASHINFER: [partial multimodal token full attention not supported]}. +(EngineCore pid=509) INFO 04-22 16:07:33 [platforms/cuda.py:334] Using TRITON_ATTN attention backend out of potential backends: ['TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=509) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=509) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=509) DEBUG 04-22 16:07:33 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=509) DEBUG 04-22 16:07:33 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) +(EngineCore pid=509) DEBUG 04-22 16:07:33 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 374, 'gelu_and_mul': 62, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=509) DEBUG 04-22 16:07:33 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) +(EngineCore pid=509) DEBUG 04-22 16:07:33 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 374, 'gelu_and_mul': 62, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=509) DEBUG 04-22 16:07:33 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=509) DEBUG 04-22 16:07:33 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00012-of-00012.safetensors', 'model-00010-of-00012.safetensors', 'model-00003-of-00012.safetensors', 'model-00006-of-00012.safetensors', 'model-00007-of-00012.safetensors', 'model-00005-of-00012.safetensors', 'model-00001-of-00012.safetensors', 'model-00009-of-00012.safetensors', 'model-00011-of-00012.safetensors', 'model-00004-of-00012.safetensors', 'model-00008-of-00012.safetensors', 'model-00002-of-00012.safetensors']] +(APIServer pid=1) DEBUG 04-22 16:07:42 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 16:07:52 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 16:08:02 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(APIServer pid=1) DEBUG 04-22 16:08:12 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 16:08:22 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 16:08:32 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 16:08:42 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=509) INFO 04-22 16:08:48 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for google/gemma-3-27b-it: 75.376589 seconds +(EngineCore pid=509) Loading safetensors checkpoint shards: 0% Completed | 0/12 [00:00 +(APIServer pid=1) DEBUG 04-22 16:09:42 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gemma3.py +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=509) INFO 04-22 16:09:46 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/2f8ea44192/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=cb6dae3406 comp=e546579c48 code=616836617014c5f9fc7251fa87e8f62cc5716448a3f7628492691f84ac2574d9 dir=/data/.cache/vllm/torch_compile_cache/2f8ea44192/rank_0_0/backbone 
+(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=509) DEBUG 
04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=509) DEBUG 04-22 
16:09:46 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 
[compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=509) DEBUG 04-22 16:09:46 
[compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=509) DEBUG 04-22 16:09:46 
[compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 
[compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=509) 
DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, 
+(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': 
False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 
[compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] Vllm config hash: cb6dae3406 +(EngineCore pid=509) INFO 04-22 16:09:46 [compilation/backends.py:1111] Dynamo bytecode transform time: 9.32 s +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=509) 
DEBUG 04-22 16:09:47 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=509) DEBUG 04-22 16:09:47 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 3.3 ms +(EngineCore pid=509) DEBUG 04-22 16:09:47 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 5.0 ms +(EngineCore pid=509) DEBUG 04-22 16:09:47 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=509) DEBUG 04-22 16:09:47 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=509) INFO 04-22 16:09:51 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=509) DEBUG 04-22 16:09:51 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/2f8ea44192/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=509) DEBUG 04-22 16:09:51 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=509) DEBUG 04-22 16:09:51 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(EngineCore pid=509) DEBUG 04-22 16:09:51 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.7 ms +(EngineCore pid=509) DEBUG 04-22 16:09:51 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=509) DEBUG 04-22 16:09:51 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(APIServer pid=1) DEBUG 04-22 16:09:52 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=509) DEBUG 04-22 16:09:54 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/2f8ea44192/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=509) DEBUG 04-22 16:09:54 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/2f8ea44192/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_2') +(EngineCore pid=509) DEBUG 04-22 16:09:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=509) DEBUG 04-22 16:09:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(EngineCore pid=509) DEBUG 04-22 16:09:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 2.2 ms +(EngineCore pid=509) DEBUG 04-22 16:09:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=509) DEBUG 04-22 16:09:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=509) DEBUG 04-22 16:09:57 [compilation/backends.py:377] Store the 5-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_5', '/data/.cache/vllm/torch_compile_cache/2f8ea44192/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_5') +(EngineCore pid=509) DEBUG 04-22 16:09:58 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=509) DEBUG 04-22 16:09:58 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.2 ms +(EngineCore pid=509) DEBUG 04-22 16:09:58 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.4 ms +(EngineCore 
pid=509) DEBUG 04-22 16:09:58 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=509) DEBUG 04-22 16:09:58 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(EngineCore pid=509) DEBUG 04-22 16:09:59 [compilation/backends.py:377] Store the 62-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_62', '/data/.cache/vllm/torch_compile_cache/2f8ea44192/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_62') +(EngineCore pid=509) INFO 04-22 16:09:59 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 12.34 s +(EngineCore pid=509) DEBUG 04-22 16:09:59 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/2f8ea44192/rank_0_0/backbone/computation_graph.py +(EngineCore pid=509) INFO 04-22 16:10:01 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/b84fd6dd09ba72e7f973ec2576a31cb1fbd82907c108bab96cab78bc2c00c365/rank_0_0/model +(EngineCore pid=509) INFO 04-22 16:10:01 [compilation/monitor.py:48] torch.compile took 25.25 s in total +(APIServer pid=1) DEBUG 04-22 16:10:02 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=509) INFO 04-22 16:10:02 [compilation/monitor.py:76] Initial profiling/warmup run took 0.48 s +(EngineCore pid=509) WARNING 04-22 16:10:08 [v1/core/kv_cache_utils.py:1059] Add 8 padding layers, may waste at most 15.38% KV cache memory +(EngineCore pid=509) INFO 04-22 16:10:08 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=509) DEBUG 04-22 16:10:08 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=509) INFO 04-22 16:10:08 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=509) DEBUG 04-22 16:10:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=509) DEBUG 04-22 16:10:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=509) DEBUG 04-22 16:10:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=509) DEBUG 04-22 16:10:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=509) DEBUG 04-22 16:10:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=509) DEBUG 04-22 16:10:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=509) DEBUG 04-22 16:10:09 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 172.00 MiB first-capture + (51-1) × 12.00 MiB per-graph +(EngineCore pid=509) DEBUG 04-22 16:10:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=509) DEBUG 04-22 16:10:10 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=509) DEBUG 04-22 16:10:10 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=509) DEBUG 04-22 16:10:10 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=509) DEBUG 04-22 16:10:10 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=509) DEBUG 04-22 16:10:10 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=509) DEBUG 04-22 16:10:10 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 8.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=509) DEBUG 04-22 16:10:11 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=509) INFO 04-22 16:10:11 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.05 GiB total +(EngineCore pid=509) DEBUG 04-22 16:10:11 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=509) DEBUG 04-22 16:10:11 [v1/worker/gpu_worker.py:430] Free memory after profiling: 26.33 GiB (total), 22.88 GiB (within requested) +(EngineCore pid=509) DEBUG 04-22 16:10:11 [v1/worker/gpu_worker.py:435] Memory profiling takes 35.21 seconds. Total non KV cache memory: 55.7GiB; torch peak memory increase: 3.99GiB; non-torch forward increase memory: 0.26GiB; weights memory: 51.45GiB. 
+(EngineCore pid=509) INFO 04-22 16:10:11 [v1/worker/gpu_worker.py:436] Available KV cache memory: 19.53 GiB +(EngineCore pid=509) INFO 04-22 16:10:11 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9632 to maintain the same effective KV cache size. +(EngineCore pid=509) WARNING 04-22 16:10:11 [v1/core/kv_cache_utils.py:1059] Add 8 padding layers, may waste at most 15.38% KV cache memory +(EngineCore pid=509) INFO 04-22 16:10:11 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 36,560 tokens +(EngineCore pid=509) INFO 04-22 16:10:11 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 4.46x +(EngineCore pid=509) 2026-04-22 16:10:11,385 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=509) DEBUG 04-22 16:10:11 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=509) 2026-04-22 16:10:11,403 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=509) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 15:50:19 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 15:50:19 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 15:50:19 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:50:19 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 15:50:19 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 15:50:19 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-3-4b-it +(APIServer pid=1) INFO 04-22 15:50:19 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 15:50:19 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 15:50:19 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-3-4b-it', 'model': 'google/gemma-3-4b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 15:50:19 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 403, in hf_raise_for_status +(APIServer pid=1) response.raise_for_status() +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1028, in raise_for_status +(APIServer pid=1) raise HTTPError(http_error_msg, response=self) +(APIServer pid=1) requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/google/gemma-3-4b-it/resolve/main/config.json +(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1550, in _get_metadata_or_catch_error +(APIServer pid=1) metadata = get_hf_file_metadata( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn +(APIServer pid=1) 
return fn(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1467, in get_hf_file_metadata +(APIServer pid=1) r = _request_wrapper( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 283, in _request_wrapper +(APIServer pid=1) response = _request_wrapper( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 307, in _request_wrapper +(APIServer pid=1) hf_raise_for_status(response) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 467, in hf_raise_for_status +(APIServer pid=1) raise _format(HfHubHTTPError, message, response) from e +(APIServer pid=1) huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-69e8ee3b-39f0338c303d9ca327360ac7;6d67260f-9456-4082-9f26-baaf306ee9c3) +(APIServer pid=1) +(APIServer pid=1) 403 Forbidden: Please enable access to public gated repositories in your fine-grained token settings to view this repository.. +(APIServer pid=1) Cannot access content at: https://huggingface.co/google/gemma-3-4b-it/resolve/main/config.json. +(APIServer pid=1) Make sure your token has the correct permissions. 
+(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 479, in cached_files +(APIServer pid=1) hf_hub_download( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn +(APIServer pid=1) return fn(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1014, in hf_hub_download +(APIServer pid=1) return _hf_hub_download_to_cache_dir( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1121, in _hf_hub_download_to_cache_dir +(APIServer pid=1) _raise_on_head_call_error(head_call_error, force_download, local_files_only) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1665, in _raise_on_head_call_error +(APIServer pid=1) raise LocalEntryNotFoundError( +(APIServer pid=1) huggingface_hub.errors.LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on. 
+(APIServer pid=1) +(APIServer pid=1) The above exception was the direct cause of the following exception: +(APIServer pid=1) +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in +(APIServer pid=1) sys.exit(main()) +(APIServer pid=1) ^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main +(APIServer pid=1) args.dispatch_function(args) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd +(APIServer pid=1) uvloop.run(run_server(args)) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run +(APIServer pid=1) return __asyncio.run( +(APIServer pid=1) ^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run +(APIServer pid=1) return runner.run(main) +(APIServer pid=1) ^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run +(APIServer pid=1) return self._loop.run_until_complete(task) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper +(APIServer pid=1) return await main +(APIServer pid=1) ^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server +(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker +(APIServer pid=1) async with build_async_engine_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return 
await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client +(APIServer pid=1) async with build_async_engine_client_from_engine_args( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 124, in build_async_engine_client_from_engine_args +(APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1539, in create_engine_config +(APIServer pid=1) maybe_override_with_speculators( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/config.py", line 583, in maybe_override_with_speculators +(APIServer pid=1) config_dict, _ = PretrainedConfig.get_config_dict( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 662, in get_config_dict +(APIServer pid=1) config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 721, in _get_config_dict +(APIServer pid=1) resolved_config_file = cached_file( +(APIServer pid=1) ^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 322, in cached_file +(APIServer pid=1) file = 
cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 553, in cached_files +(APIServer pid=1) raise OSError( +(APIServer pid=1) OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files. +(APIServer pid=1) Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'. diff --git a/accuracy/results/v0.19.0/logs/google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.log new file mode 100644 index 00000000..babe20f2 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.log @@ -0,0 +1,766 @@ +DEBUG 04-22 16:01:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 16:01:51 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 16:01:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 16:01:51 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 16:01:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 16:01:51 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 16:01:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 16:01:51 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 16:01:51 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 16:01:51 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-22 16:01:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 16:01:51 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 16:01:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 16:01:58 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 16:01:58 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 16:01:58 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 16:01:58 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 16:01:58 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 16:01:58 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 16:01:58 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 16:01:58 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 16:01:58 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-3-4b-it +(APIServer pid=1) INFO 04-22 16:01:58 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 16:01:58 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 16:01:58 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-3-4b-it', 'model': 'google/gemma-3-4b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 16:01:58 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 16:01:59 [model_executor/models/registry.py:774] Cached model info file for class 
vllm.model_executor.models.gemma3_mm.Gemma3ForConditionalGeneration not found +(APIServer pid=1) DEBUG 04-22 16:01:59 [model_executor/models/registry.py:834] Cache model info for class vllm.model_executor.models.gemma3_mm.Gemma3ForConditionalGeneration miss. Loading model instead. +(APIServer pid=1) DEBUG 04-22 16:02:09 [model_executor/models/registry.py:844] Loaded model info for class vllm.model_executor.models.gemma3_mm.Gemma3ForConditionalGeneration +(APIServer pid=1) DEBUG 04-22 16:02:09 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 9.9875990 secs +(APIServer pid=1) INFO 04-22 16:02:09 [config/model.py:549] Resolved architecture: Gemma3ForConditionalGeneration +(APIServer pid=1) INFO 04-22 16:02:09 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 16:02:09 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 16:02:09 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 16:02:09 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 16:02:09 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 16:02:09 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 16:02:09 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 16:02:09 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) WARNING 04-22 16:02:09 [platforms/cuda.py:199] Forcing --disable_chunked_mm_input for models with multimodal-bidirectional attention. +(APIServer pid=1) DEBUG 04-22 16:02:09 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. 
+(APIServer pid=1) DEBUG 04-22 16:02:09 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 16:02:12 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 16:02:13 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. +(APIServer pid=1) DEBUG 04-22 16:02:13 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +(APIServer pid=1) Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +DEBUG 04-22 16:02:25 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 16:02:25 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 16:02:25 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 16:02:25 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 16:02:25 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 16:02:25 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 16:02:25 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 16:02:25 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 16:02:25 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 16:02:25 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 16:02:25 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 16:02:25 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-22 16:02:30 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(APIServer pid=1) DEBUG 04-22 16:02:32 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. +(EngineCore pid=704) DEBUG 04-22 16:02:32 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 16:02:32 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=704) DEBUG 04-22 16:02:32 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/bb3f96a5-e878-4b7a-97bc-6b285c64279c'], outputs=['ipc:///tmp/f5d2b059-3379-4652-91b4-d539a064ea95'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=704) DEBUG 04-22 16:02:32 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=704) DEBUG 04-22 16:02:32 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=704) DEBUG 04-22 16:02:32 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=704) DEBUG 04-22 16:02:32 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=704) DEBUG 04-22 16:02:32 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=704) INFO 04-22 16:02:32 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google/gemma-3-4b-it', speculative_config=None, tokenizer='google/gemma-3-4b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google/gemma-3-4b-it, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=704) DEBUG 04-22 16:02:32 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(EngineCore pid=704) DEBUG 04-22 16:02:34 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.218:44417 backend=nccl +(EngineCore pid=704) INFO 04-22 16:02:34 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.218:44417 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(EngineCore pid=704) DEBUG 04-22 16:02:34 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=704) INFO 04-22 16:02:34 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=704) DEBUG 04-22 16:02:34 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776873754.7606223, auto_measure=True +(EngineCore pid=704) DEBUG 04-22 16:02:34 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=704) DEBUG 04-22 16:02:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=704) DEBUG 04-22 16:02:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=704) DEBUG 04-22 16:02:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=704) DEBUG 04-22 16:02:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=704) 
DEBUG 04-22 16:02:34 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(EngineCore pid=704) DEBUG 04-22 16:02:34 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=704) DEBUG 04-22 16:02:34 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. +(EngineCore pid=704) Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. +(APIServer pid=1) DEBUG 04-22 16:02:42 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=704) DEBUG 04-22 16:02:42 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=704) INFO 04-22 16:02:42 [v1/worker/gpu_model_runner.py:4735] Starting to load model google/gemma-3-4b-it... +(EngineCore pid=704) INFO 04-22 16:02:43 [model_executor/models/interfaces.py:171] Contains out of vocabulary multimodal tokens? False +(EngineCore pid=704) INFO 04-22 16:02:43 [platforms/cuda.py:390] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention +(EngineCore pid=704) INFO 04-22 16:02:43 [model_executor/.../attention/mm_encoder_attention.py:230] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention. +(EngineCore pid=704) INFO 04-22 16:02:43 [config/vllm.py:790] Asynchronous scheduling is enabled. 
+(EngineCore pid=704) DEBUG 04-22 16:02:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=704) DEBUG 04-22 16:02:43 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=256, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=True, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASH_ATTN: [partial multimodal token full attention not supported], FLASHINFER: [partial multimodal token full attention not supported]}. +(EngineCore pid=704) INFO 04-22 16:02:43 [platforms/cuda.py:334] Using TRITON_ATTN attention backend out of potential backends: ['TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=704) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=704) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=704) DEBUG 04-22 16:02:43 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=704) DEBUG 04-22 16:02:43 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) +(EngineCore pid=704) DEBUG 04-22 16:02:43 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 206, 'gelu_and_mul': 34, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=704) DEBUG 04-22 16:02:43 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) +(EngineCore pid=704) DEBUG 04-22 16:02:43 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 206, 'gelu_and_mul': 34, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=704) DEBUG 04-22 16:02:43 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=704) DEBUG 04-22 16:02:43 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00002.safetensors', 'model-00001-of-00002.safetensors']] +(APIServer pid=1) DEBUG 04-22 16:02:52 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=704) INFO 04-22 16:02:56 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for google/gemma-3-4b-it: 12.245518 seconds +(EngineCore pid=704) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gemma3.py +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=704) INFO 04-22 16:03:10 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/51fc3e1252/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=fa200d5147 comp=e546579c48 code=616836617014c5f9fc7251fa87e8f62cc5716448a3f7628492691f84ac2574d9 dir=/data/.cache/vllm/torch_compile_cache/51fc3e1252/rank_0_0/backbone +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=704) DEBUG 04-22 
16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 
'VLLM_API_KEY': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 
'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 
'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=704) DEBUG 
04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=704) 
DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=704) DEBUG 04-22 16:03:10 
[compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 
[compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, 
+(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] Vllm config hash: fa200d5147 +(EngineCore pid=704) INFO 04-22 16:03:10 [compilation/backends.py:1111] Dynamo bytecode transform time: 5.58 s +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=704) DEBUG 04-22 16:03:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=704) DEBUG 04-22 16:03:11 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=704) DEBUG 04-22 16:03:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.2 ms +(EngineCore pid=704) DEBUG 04-22 16:03:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=704) DEBUG 04-22 16:03:11 [compilation/passes/vllm_inductor_pass.py:79] 
FixFunctionalizationPass completed in 0.2 ms +(APIServer pid=1) DEBUG 04-22 16:03:12 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=704) INFO 04-22 16:03:15 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=704) DEBUG 04-22 16:03:15 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/51fc3e1252/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=704) DEBUG 04-22 16:03:16 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=704) DEBUG 04-22 16:03:16 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=704) DEBUG 04-22 16:03:16 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.8 ms +(EngineCore pid=704) DEBUG 04-22 16:03:16 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=704) DEBUG 04-22 16:03:16 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=704) DEBUG 04-22 16:03:18 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/51fc3e1252/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=704) DEBUG 04-22 16:03:19 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/51fc3e1252/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_2') +(EngineCore pid=704) DEBUG 04-22 16:03:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 
no-op reshapes and slices +(EngineCore pid=704) DEBUG 04-22 16:03:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=704) DEBUG 04-22 16:03:19 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.7 ms +(EngineCore pid=704) DEBUG 04-22 16:03:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=704) DEBUG 04-22 16:03:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=704) DEBUG 04-22 16:03:22 [compilation/backends.py:377] Store the 5-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_5', '/data/.cache/vllm/torch_compile_cache/51fc3e1252/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_5') +(APIServer pid=1) DEBUG 04-22 16:03:22 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=704) DEBUG 04-22 16:03:22 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=704) DEBUG 04-22 16:03:22 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.2 ms +(EngineCore pid=704) DEBUG 04-22 16:03:22 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.4 ms +(EngineCore pid=704) DEBUG 04-22 16:03:22 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=704) DEBUG 04-22 16:03:22 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(EngineCore pid=704) DEBUG 04-22 16:03:23 [compilation/backends.py:377] Store the 34-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_34', '/data/.cache/vllm/torch_compile_cache/51fc3e1252/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_34') +(EngineCore pid=704) INFO 
04-22 16:03:23 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 12.51 s +(EngineCore pid=704) DEBUG 04-22 16:03:23 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/51fc3e1252/rank_0_0/backbone/computation_graph.py +(EngineCore pid=704) INFO 04-22 16:03:25 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/bf6f8b734858fcb15213f9ee93845ad6e23d10da81c12f4ce6ecf561a8822c30/rank_0_0/model +(EngineCore pid=704) INFO 04-22 16:03:25 [compilation/monitor.py:48] torch.compile took 20.04 s in total +(EngineCore pid=704) INFO 04-22 16:03:25 [compilation/monitor.py:76] Initial profiling/warmup run took 0.54 s +(EngineCore pid=704) WARNING 04-22 16:03:31 [v1/core/kv_cache_utils.py:1059] Add 1 padding layers, may waste at most 3.45% KV cache memory +(EngineCore pid=704) INFO 04-22 16:03:31 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=704) DEBUG 04-22 16:03:31 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=704) INFO 04-22 16:03:31 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=704) DEBUG 04-22 16:03:31 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=704) DEBUG 04-22 16:03:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=704) DEBUG 04-22 16:03:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=704) DEBUG 04-22 16:03:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=704) DEBUG 04-22 16:03:32 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=704) DEBUG 04-22 16:03:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=704) DEBUG 04-22 16:03:32 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=704) DEBUG 04-22 16:03:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(APIServer pid=1) DEBUG 04-22 16:03:32 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=704) DEBUG 04-22 16:03:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=704) DEBUG 04-22 16:03:33 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=704) DEBUG 04-22 16:03:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=704) DEBUG 04-22 16:03:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=704) DEBUG 04-22 16:03:33 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=704) DEBUG 04-22 16:03:33 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 4.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=704) DEBUG 04-22 16:03:33 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=704) INFO 04-22 16:03:33 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.70 GiB total +(EngineCore pid=704) DEBUG 04-22 16:03:34 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; 
Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=704) DEBUG 04-22 16:03:34 [v1/worker/gpu_worker.py:430] Free memory after profiling: 69.27 GiB (total), 65.82 GiB (within requested) +(EngineCore pid=704) DEBUG 04-22 16:03:34 [v1/worker/gpu_worker.py:435] Memory profiling takes 29.72 seconds. Total non KV cache memory: 12.72GiB; torch peak memory increase: 3.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 8.58GiB. +(EngineCore pid=704) INFO 04-22 16:03:34 [v1/worker/gpu_worker.py:436] Available KV cache memory: 62.51 GiB +(EngineCore pid=704) INFO 04-22 16:03:34 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9588 to maintain the same effective KV cache size. +(EngineCore pid=704) WARNING 04-22 16:03:34 [v1/core/kv_cache_utils.py:1059] Add 1 padding layers, may waste at most 3.45% KV cache memory +(EngineCore pid=704) INFO 04-22 16:03:34 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 468,144 tokens +(EngineCore pid=704) INFO 04-22 16:03:34 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 57.05x +(EngineCore pid=704) 2026-04-22 16:03:34,271 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(EngineCore pid=704) DEBUG 04-22 16:03:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=704) 2026-04-22 16:03:34,282 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=704) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-23 01:03:13 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-23 01:03:13 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-23 01:03:13 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-23 01:03:13 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-23 01:03:13 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-23 01:03:13 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-4-E4B-it +(APIServer pid=1) INFO 04-23 01:03:13 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-23 01:03:13 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-23 01:03:13 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-4-E4B-it', 'model': 'google/gemma-4-E4B-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-23 01:03:13 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in +(APIServer pid=1) sys.exit(main()) +(APIServer pid=1) ^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main +(APIServer pid=1) args.dispatch_function(args) +(APIServer pid=1) File 
"/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd +(APIServer pid=1) uvloop.run(run_server(args)) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run +(APIServer pid=1) return __asyncio.run( +(APIServer pid=1) ^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run +(APIServer pid=1) return runner.run(main) +(APIServer pid=1) ^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run +(APIServer pid=1) return self._loop.run_until_complete(task) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper +(APIServer pid=1) return await main +(APIServer pid=1) ^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server +(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker +(APIServer pid=1) async with build_async_engine_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client +(APIServer pid=1) async with build_async_engine_client_from_engine_args( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer 
pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 124, in build_async_engine_client_from_engine_args +(APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1549, in create_engine_config +(APIServer pid=1) model_config = self.create_model_config() +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1398, in create_model_config +(APIServer pid=1) return ModelConfig( +(APIServer pid=1) ^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/pydantic/_internal/_dataclasses.py", line 121, in __init__ +(APIServer pid=1) s.__pydantic_validator__.validate_python(ArgsKwargs(args, kwargs), self_instance=s) +(APIServer pid=1) pydantic_core._pydantic_core.ValidationError: 1 validation error for ModelConfig +(APIServer pid=1) Value error, The checkpoint you are trying to load has model type `gemma4` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date. +(APIServer pid=1) +(APIServer pid=1) You can update Transformers with the command `pip install --upgrade transformers`. If this does not work, and the checkpoint is very new, then there may not be a release version that supports this model yet. 
In this case, you can get the most up-to-date code by installing Transformers from source with the command `pip install git+https://github.com/huggingface/transformers.git` [type=value_error, input_value=ArgsKwargs((), {'model': ...nderer_num_workers': 1}), input_type=ArgsKwargs] +(APIServer pid=1) For further information visit https://errors.pydantic.dev/2.12/v/value_error diff --git a/accuracy/results/v0.19.0/logs/google-gemma-7b--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/google-gemma-7b--h100-80gb--tp1pp1dp1--8192.log new file mode 100644 index 00000000..c939bdb3 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/google-gemma-7b--h100-80gb--tp1pp1dp1--8192.log @@ -0,0 +1,776 @@ +DEBUG 04-22 19:55:20 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 19:55:20 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 19:55:20 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 19:55:20 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:55:20 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:55:20 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 19:55:20 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 19:55:20 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 19:55:20 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 19:55:20 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:55:20 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:55:20 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 19:55:25 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 19:55:27 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 19:55:27 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 19:55:27 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 19:55:27 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 19:55:27 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 19:55:27 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 19:55:27 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 19:55:27 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 19:55:27 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-7b +(APIServer pid=1) INFO 04-22 19:55:27 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 19:55:27 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 19:55:27 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-7b', 'model': 'google/gemma-7b', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 19:55:27 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 19:55:28 [model_executor/models/registry.py:774] Cached model info file for class vllm.model_executor.models.gemma.GemmaForCausalLM not found +(APIServer pid=1) DEBUG 04-22 19:55:28 [model_executor/models/registry.py:834] Cache model info for class vllm.model_executor.models.gemma.GemmaForCausalLM miss. Loading model instead. 
+(APIServer pid=1) DEBUG 04-22 19:55:37 [model_executor/models/registry.py:844] Loaded model info for class vllm.model_executor.models.gemma.GemmaForCausalLM +(APIServer pid=1) DEBUG 04-22 19:55:38 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 9.7641226 secs +(APIServer pid=1) INFO 04-22 19:55:38 [config/model.py:549] Resolved architecture: GemmaForCausalLM +(APIServer pid=1) INFO 04-22 19:55:38 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 19:55:38 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 19:55:38 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 19:55:38 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 19:55:38 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 19:55:38 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 19:55:38 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 19:55:38 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 19:55:38 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 19:55:38 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 19:55:42 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 19:55:42 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 19:55:45 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 19:55:45 [platforms/__init__.py:37] Checking if TPU platform is available. 
+DEBUG 04-22 19:55:45 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 19:55:45 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:55:45 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:55:45 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 19:55:45 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 19:55:45 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 19:55:45 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 19:55:45 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:55:45 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:55:45 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 19:55:50 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(APIServer pid=1) DEBUG 04-22 19:55:52 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. +(EngineCore pid=476) DEBUG 04-22 19:55:52 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 19:55:52 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=476) DEBUG 04-22 19:55:52 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/ffb07a5d-fa70-4ff0-a712-25f0afaf7ecc'], outputs=['ipc:///tmp/aae2e87c-8846-4ae1-96f9-87bec80aa3ae'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=476) DEBUG 04-22 19:55:52 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=476) DEBUG 04-22 19:55:52 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=476) DEBUG 04-22 19:55:52 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=476) DEBUG 04-22 19:55:52 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=476) DEBUG 04-22 19:55:52 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=476) INFO 04-22 19:55:52 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google/gemma-7b', speculative_config=None, tokenizer='google/gemma-7b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google/gemma-7b, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=476) DEBUG 04-22 19:55:52 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.130.3.159:52289 backend=nccl +(EngineCore pid=476) INFO 04-22 19:55:52 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.130.3.159:52289 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=476) DEBUG 04-22 19:55:52 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=476) INFO 04-22 19:55:52 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=476) DEBUG 04-22 19:55:53 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776887753.2019594, auto_measure=True +(EngineCore pid=476) DEBUG 04-22 19:55:53 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=476) DEBUG 04-22 19:55:53 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=476) DEBUG 04-22 19:55:53 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=476) DEBUG 04-22 19:55:53 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=476) DEBUG 04-22 19:55:53 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=476) DEBUG 04-22 19:55:53 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=476) DEBUG 04-22 19:55:53 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=476) DEBUG 04-22 19:55:53 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=476) DEBUG 04-22 19:55:53 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=476) INFO 04-22 19:55:53 [v1/worker/gpu_model_runner.py:4735] Starting to load model google/gemma-7b... +(EngineCore pid=476) DEBUG 04-22 19:55:54 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=256, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=476) INFO 04-22 19:55:54 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=476) INFO 04-22 19:55:54 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=476) WARNING 04-22 19:55:54 [model_executor/models/gemma.py:67] Gemma's activation function was incorrectly set to exact GeLU in the config JSON file when it was initially released. Changing the activation function to approximate GeLU (`gelu_pytorch_tanh`). If you want to use the legacy `gelu`, edit the config JSON to set `hidden_activation=gelu` instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details. +(EngineCore pid=476) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
+(EngineCore pid=476) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=476) DEBUG 04-22 19:55:54 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=476) DEBUG 04-22 19:55:54 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=476) DEBUG 04-22 19:55:54 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 57, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'gelu_and_mul': 1, 'logits_processor': 1}) +(EngineCore pid=476) DEBUG 04-22 19:55:54 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=476) DEBUG 04-22 19:55:54 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00002-of-00004.safetensors']] +(APIServer pid=1) DEBUG 04-22 19:56:02 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 19:56:12 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 19:56:22 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 19:56:32 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 19:56:42 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 19:56:52 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 19:57:02 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(APIServer pid=1) DEBUG 04-22 19:57:12 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 19:57:22 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=476) INFO 04-22 19:57:28 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for google/gemma-7b: 94.085020 seconds +(EngineCore pid=476) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore 
pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gemma.py +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=476) INFO 04-22 19:57:47 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/3e4d7bd8e6/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=0835f40ede comp=e546579c48 code=d159a3198a8d5227bb8beeacc16de5095b5713eb9baa852b8ce11e4d03b46410 dir=/data/.cache/vllm/torch_compile_cache/3e4d7bd8e6/rank_0_0/backbone +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] Compile env 
factors (raw): +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': 
False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, 
+(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=476) 
DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 
[compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=476) DEBUG 04-22 19:57:47 
[compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 
[compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=476) DEBUG 04-22 
19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, 
+(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore 
pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 
[compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] Vllm config hash: 0835f40ede +(EngineCore pid=476) INFO 04-22 19:57:47 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.44 s +(EngineCore pid=476) DEBUG 04-22 19:57:48 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=476) DEBUG 04-22 19:57:48 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=476) DEBUG 04-22 19:57:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices 
+(EngineCore pid=476) DEBUG 04-22 19:57:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=476) DEBUG 04-22 19:57:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms +(EngineCore pid=476) DEBUG 04-22 19:57:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=476) DEBUG 04-22 19:57:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=476) INFO 04-22 19:57:49 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=476) DEBUG 04-22 19:57:49 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/3e4d7bd8e6/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=476) DEBUG 04-22 19:57:49 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=476) DEBUG 04-22 19:57:49 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(EngineCore pid=476) DEBUG 04-22 19:57:49 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.3 ms +(EngineCore pid=476) DEBUG 04-22 19:57:49 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=476) DEBUG 04-22 19:57:49 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=476) DEBUG 04-22 19:57:50 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/3e4d7bd8e6/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=476) DEBUG 04-22 19:57:51 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=476) DEBUG 04-22 19:57:51 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms +(EngineCore pid=476) DEBUG 04-22 19:57:51 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.2 ms +(EngineCore pid=476) DEBUG 04-22 19:57:51 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=476) DEBUG 04-22 19:57:51 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(EngineCore pid=476) DEBUG 04-22 19:57:51 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/3e4d7bd8e6/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') +(EngineCore pid=476) INFO 04-22 19:57:51 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 3.92 s +(EngineCore pid=476) DEBUG 04-22 19:57:52 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/3e4d7bd8e6/rank_0_0/backbone/computation_graph.py +(APIServer pid=1) DEBUG 04-22 19:57:52 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=476) INFO 04-22 19:57:53 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/cfd90f7b6e9eca63e8913919f609c7f6cdef60949c23277267eda847f1fcac03/rank_0_0/model +(EngineCore pid=476) INFO 04-22 19:57:53 [compilation/monitor.py:48] torch.compile took 8.99 s in total +(EngineCore pid=476) INFO 04-22 19:57:53 [compilation/monitor.py:76] Initial profiling/warmup run took 0.52 s +(EngineCore pid=476) INFO 04-22 19:57:59 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=476) INFO 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=476) DEBUG 04-22 19:57:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=476) DEBUG 04-22 19:57:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 172.00 MiB first-capture + (51-1) × 6.00 MiB per-graph 
+(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=476) DEBUG 04-22 19:57:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=476) DEBUG 04-22 19:57:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=476) INFO 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total +(EngineCore pid=476) DEBUG 04-22 19:58:00 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=476) DEBUG 04-22 19:58:00 [v1/worker/gpu_worker.py:430] Free memory after profiling: 60.94 GiB (total), 57.5 GiB (within requested) +(EngineCore pid=476) DEBUG 04-22 19:58:00 [v1/worker/gpu_worker.py:435] Memory profiling takes 16.01 seconds. Total non KV cache memory: 19.79GiB; torch peak memory increase: 3.63GiB; non-torch forward increase memory: 0.25GiB; weights memory: 15.91GiB. 
+(EngineCore pid=476) INFO 04-22 19:58:00 [v1/worker/gpu_worker.py:436] Available KV cache memory: 55.44 GiB +(EngineCore pid=476) INFO 04-22 19:58:00 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. +(EngineCore pid=476) INFO 04-22 19:58:00 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 129,760 tokens +(EngineCore pid=476) INFO 04-22 19:58:00 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 15.84x +(EngineCore pid=476) 2026-04-22 19:58:00,118 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=476) DEBUG 04-22 19:58:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=476) 2026-04-22 19:58:00,126 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=476) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:47:43 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:47:43 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:47:43 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:47:43 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:47:43 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:47:43 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model ibm-granite/granite-3.1-2b-instruct +(APIServer pid=1) INFO 04-22 00:47:43 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:47:43 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:47:43 [entrypoints/utils.py:233] non-default args: {'model_tag': 'ibm-granite/granite-3.1-2b-instruct', 'model': 'ibm-granite/granite-3.1-2b-instruct', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:47:43 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:47:43 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.granite.GraniteForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:47:43 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0014069 secs +(APIServer pid=1) INFO 04-22 00:47:43 [config/model.py:549] Resolved architecture: GraniteForCausalLM +(APIServer pid=1) INFO 04-22 00:47:43 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:47:43 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:47:43 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:47:43 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:47:43 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:47:43 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) INFO 04-22 00:47:43 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:47:43 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:47:43 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:47:43 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:47:43 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:47:43 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:47:47 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:47:47 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:47:47 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:47:47 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:47:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:47:47 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:47:47 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:47:47 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:47:47 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:47:47 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:47:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:47:47 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:47:52 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+(EngineCore pid=241) DEBUG 04-22 00:47:53 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:47:53 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=241) DEBUG 04-22 00:47:53 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/0349d557-8ab3-47d9-9ad8-c814627504f4'], outputs=['ipc:///tmp/b76a23fe-76f6-4c98-b58e-8cebef12265e'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=241) DEBUG 04-22 00:47:53 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=241) DEBUG 04-22 00:47:53 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=241) DEBUG 04-22 00:47:53 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=241) DEBUG 04-22 00:47:53 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=241) DEBUG 04-22 00:47:53 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=241) INFO 04-22 00:47:53 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='ibm-granite/granite-3.1-2b-instruct', speculative_config=None, tokenizer='ibm-granite/granite-3.1-2b-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=ibm-granite/granite-3.1-2b-instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=241) DEBUG 04-22 00:47:54 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.80:51203 backend=nccl +(EngineCore pid=241) INFO 04-22 00:47:54 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.80:51203 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=241) DEBUG 04-22 00:47:54 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=241) INFO 04-22 00:47:54 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=241) DEBUG 04-22 00:47:54 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776818874.7326565, auto_measure=True +(EngineCore pid=241) DEBUG 04-22 00:47:54 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=241) DEBUG 04-22 00:47:54 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=241) DEBUG 04-22 00:47:54 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=241) DEBUG 04-22 00:47:54 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=241) DEBUG 04-22 00:47:54 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=241) DEBUG 04-22 00:47:54 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=241) DEBUG 04-22 00:47:54 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=241) DEBUG 04-22 00:47:54 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=241) DEBUG 04-22 00:47:54 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=241) INFO 04-22 00:47:54 [v1/worker/gpu_model_runner.py:4735] Starting to load model ibm-granite/granite-3.1-2b-instruct... +(EngineCore pid=241) DEBUG 04-22 00:47:55 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=64, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=241) INFO 04-22 00:47:55 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=241) INFO 04-22 00:47:55 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=241) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=241) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=241) DEBUG 04-22 00:47:55 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=241) DEBUG 04-22 00:47:55 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=241) DEBUG 04-22 00:47:55 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 81, 'silu_and_mul': 40, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=241) DEBUG 04-22 00:47:55 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=241) DEBUG 04-22 00:47:55 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors']] +(EngineCore pid=241) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 +(APIServer pid=1) DEBUG 04-22 00:48:03 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/granite.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=241) DEBUG 04-22 00:48:05 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=241) INFO 04-22 00:48:05 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/2cc6b1f3b1/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=bf3c110cf4 comp=e546579c48 code=9615bc1b6a3cdadf99f40f12188b347f2842282ef159341e824808308c14a2aa dir=/data/.cache/vllm/torch_compile_cache/2cc6b1f3b1/rank_0_0/backbone +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, 
+(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=241) DEBUG 04-22 00:48:05 
[compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 
[compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', 
+(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, 
+(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, 
+(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=241) 
DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=241) DEBUG 04-22 00:48:05 
[compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore 
pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 
'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] Vllm config hash: bf3c110cf4 +(EngineCore pid=241) INFO 04-22 00:48:05 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.92 s +(EngineCore pid=241) DEBUG 04-22 00:48:06 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=241) DEBUG 04-22 00:48:06 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=241) DEBUG 04-22 00:48:06 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=241) DEBUG 04-22 00:48:06 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=241) DEBUG 04-22 00:48:06 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=241) DEBUG 04-22 00:48:06 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=241) DEBUG 04-22 00:48:06 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=241) INFO 04-22 00:48:08 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=241) DEBUG 04-22 00:48:08 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/2cc6b1f3b1/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=241) DEBUG 04-22 00:48:08 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=241) DEBUG 04-22 00:48:08 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(EngineCore pid=241) DEBUG 04-22 00:48:08 
[compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=241) DEBUG 04-22 00:48:08 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=241) DEBUG 04-22 00:48:08 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=241) DEBUG 04-22 00:48:09 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/2cc6b1f3b1/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=241) DEBUG 04-22 00:48:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=241) DEBUG 04-22 00:48:11 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.0 ms +(EngineCore pid=241) DEBUG 04-22 00:48:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=241) DEBUG 04-22 00:48:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=241) DEBUG 04-22 00:48:11 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=241) DEBUG 04-22 00:48:11 [compilation/backends.py:377] Store the 40-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_40', '/data/.cache/vllm/torch_compile_cache/2cc6b1f3b1/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_40') +(EngineCore pid=241) INFO 04-22 00:48:11 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.36 s +(EngineCore pid=241) DEBUG 04-22 00:48:11 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/2cc6b1f3b1/rank_0_0/backbone/computation_graph.py +(EngineCore pid=241) INFO 04-22 
00:48:12 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/d5e5b1f5dbb9fccea6e0b08fa936f7a8d04b152a219e5cf2027018b7be736bf9/rank_0_0/model +(EngineCore pid=241) INFO 04-22 00:48:12 [compilation/monitor.py:48] torch.compile took 11.58 s in total +(EngineCore pid=241) INFO 04-22 00:48:12 [compilation/monitor.py:76] Initial profiling/warmup run took 0.18 s +(APIServer pid=1) DEBUG 04-22 00:48:13 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=241) INFO 04-22 00:48:21 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=241) DEBUG 04-22 00:48:21 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=241) INFO 04-22 00:48:21 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=241) DEBUG 04-22 00:48:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) DEBUG 04-22 00:48:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) DEBUG 04-22 00:48:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) DEBUG 04-22 00:48:22 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5928] Estimated 
PIECEWISE CUDA graph memory: 100.00 MiB first-capture + (51-1) × 26.00 MiB per-graph +(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) DEBUG 04-22 00:48:22 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) DEBUG 04-22 00:48:22 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 134.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=241) INFO 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.60 GiB total +(EngineCore pid=241) DEBUG 04-22 00:48:23 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=241) DEBUG 04-22 00:48:23 [v1/worker/gpu_worker.py:430] Free memory after profiling: 73.15 GiB (total), 69.7 GiB (within requested) +(EngineCore pid=241) DEBUG 04-22 00:48:23 [v1/worker/gpu_worker.py:435] Memory profiling takes 22.35 seconds. Total non KV cache memory: 5.95GiB; torch peak memory increase: 0.75GiB; non-torch forward increase memory: 0.46GiB; weights memory: 4.74GiB. 
+(EngineCore pid=241) INFO 04-22 00:48:23 [v1/worker/gpu_worker.py:436] Available KV cache memory: 69.28 GiB +(EngineCore pid=241) INFO 04-22 00:48:23 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9702 to maintain the same effective KV cache size. +(EngineCore pid=241) INFO 04-22 00:48:23 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 908,048 tokens +(EngineCore pid=241) INFO 04-22 00:48:23 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 110.85x +(EngineCore pid=241) 2026-04-22 00:48:23,286 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=241) DEBUG 04-22 00:48:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=241) 2026-04-22 00:48:23,300 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=241) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:48:47 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:48:47 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:48:47 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:48:47 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:48:47 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:48:47 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model ibm-granite/granite-3.1-8b-instruct +(APIServer pid=1) INFO 04-22 00:48:47 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:48:47 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:48:47 [entrypoints/utils.py:233] non-default args: {'model_tag': 'ibm-granite/granite-3.1-8b-instruct', 'model': 'ibm-granite/granite-3.1-8b-instruct', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:48:47 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:48:48 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.granite.GraniteForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:48:48 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0010872 secs +(APIServer pid=1) INFO 04-22 00:48:48 [config/model.py:549] Resolved architecture: GraniteForCausalLM +(APIServer pid=1) INFO 04-22 00:48:48 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:48:48 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:48:48 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:48:48 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:48:48 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:48:48 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) INFO 04-22 00:48:48 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:48:48 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:48:48 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:48:48 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:48:48 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:48:48 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:48:52 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:48:52 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:48:52 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:48:52 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:48:52 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:48:52 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:48:52 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:48:52 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:48:52 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:48:52 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:48:52 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:48:52 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:48:57 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+(EngineCore pid=243) DEBUG 04-22 00:48:58 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:48:58 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=243) DEBUG 04-22 00:48:58 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/fa12e5fc-6b20-4e7a-b9ac-6e0b03ec1faf'], outputs=['ipc:///tmp/41c5cdb6-ddb7-45b0-84ef-bb868b12ef8f'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 00:48:58 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 00:48:58 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 00:48:58 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 00:48:58 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 00:48:58 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=243) INFO 04-22 00:48:58 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='ibm-granite/granite-3.1-8b-instruct', speculative_config=None, tokenizer='ibm-granite/granite-3.1-8b-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=ibm-granite/granite-3.1-8b-instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) DEBUG 04-22 00:48:59 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.31:48457 backend=nccl +(EngineCore pid=243) INFO 04-22 00:48:59 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.31:48457 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) DEBUG 04-22 00:48:59 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) INFO 04-22 00:48:59 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=243) DEBUG 04-22 00:48:59 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776818939.6237643, auto_measure=True +(EngineCore pid=243) DEBUG 04-22 00:48:59 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=243) DEBUG 04-22 00:48:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:48:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:48:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 00:48:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=243) DEBUG 04-22 00:48:59 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=243) DEBUG 04-22 00:48:59 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=243) DEBUG 04-22 00:48:59 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=243) DEBUG 04-22 00:48:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) INFO 04-22 00:48:59 [v1/worker/gpu_model_runner.py:4735] Starting to load model ibm-granite/granite-3.1-8b-instruct... +(EngineCore pid=243) DEBUG 04-22 00:49:00 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=243) INFO 04-22 00:49:00 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=243) INFO 04-22 00:49:00 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=243) DEBUG 04-22 00:49:00 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=243) DEBUG 04-22 00:49:00 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=243) DEBUG 04-22 00:49:00 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 81, 'silu_and_mul': 40, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=243) DEBUG 04-22 00:49:00 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=243) DEBUG 04-22 00:49:00 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors']] +(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(APIServer pid=1) DEBUG 04-22 00:49:18 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/granite.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=243) INFO 04-22 00:49:20 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/accd715892/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=9678c869f9 comp=e546579c48 code=9615bc1b6a3cdadf99f40f12188b347f2842282ef159341e824808308c14a2aa dir=/data/.cache/vllm/torch_compile_cache/accd715892/rank_0_0/backbone +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 
04-22 00:49:20 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 
'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 
1024, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 
'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': 
-1.0, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, 
+(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 
[compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, 
+(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 
'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=243) DEBUG 
04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] Vllm config hash: 9678c869f9 +(EngineCore pid=243) INFO 04-22 00:49:20 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.97 s +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=243) DEBUG 04-22 00:49:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:49:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=243) DEBUG 04-22 00:49:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=243) DEBUG 04-22 00:49:21 
[compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:49:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) INFO 04-22 00:49:23 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=243) DEBUG 04-22 00:49:23 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/accd715892/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=243) DEBUG 04-22 00:49:23 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:49:23 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=243) DEBUG 04-22 00:49:23 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=243) DEBUG 04-22 00:49:23 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:49:23 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) DEBUG 04-22 00:49:24 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/accd715892/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=243) DEBUG 04-22 00:49:25 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 00:49:25 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.0 ms +(EngineCore pid=243) DEBUG 04-22 00:49:25 [compilation/passes/vllm_inductor_pass.py:79] 
PostCleanupPass completed in 1.0 ms +(EngineCore pid=243) DEBUG 04-22 00:49:25 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 00:49:25 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(EngineCore pid=243) DEBUG 04-22 00:49:26 [compilation/backends.py:377] Store the 40-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_40', '/data/.cache/vllm/torch_compile_cache/accd715892/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_40') +(EngineCore pid=243) INFO 04-22 00:49:26 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.82 s +(EngineCore pid=243) DEBUG 04-22 00:49:26 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/accd715892/rank_0_0/backbone/computation_graph.py +(EngineCore pid=243) INFO 04-22 00:49:27 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/3fa591bfa994b8ea54e988e8287be9c45f124a0ed4b24d35f98d24f4e34631fa/rank_0_0/model +(EngineCore pid=243) INFO 04-22 00:49:27 [compilation/monitor.py:48] torch.compile took 12.08 s in total +(EngineCore pid=243) INFO 04-22 00:49:28 [compilation/monitor.py:76] Initial profiling/warmup run took 0.48 s +(APIServer pid=1) DEBUG 04-22 00:49:28 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=243) INFO 04-22 00:49:36 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=243) DEBUG 04-22 00:49:36 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=243) INFO 04-22 00:49:36 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:49:37 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:49:37 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 124.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:49:37 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
+(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 00:49:37 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=243) INFO 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.74 GiB total +(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.7 GiB (total), 59.26 GiB (within requested) +(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_worker.py:435] Memory profiling takes 22.42 seconds. Total non KV cache memory: 16.57GiB; torch peak memory increase: 0.85GiB; non-torch forward increase memory: 0.46GiB; weights memory: 15.25GiB. +(EngineCore pid=243) INFO 04-22 00:49:37 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.66 GiB +(EngineCore pid=243) INFO 04-22 00:49:37 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9594 to maintain the same effective KV cache size. 
+(EngineCore pid=243) INFO 04-22 00:49:37 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 384,432 tokens +(EngineCore pid=243) INFO 04-22 00:49:37 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 46.93x +(EngineCore pid=243) 2026-04-22 00:49:38,019 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=243) DEBUG 04-22 00:49:38 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) 2026-04-22 00:49:38,031 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:54:08 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:54:08 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:54:08 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:54:08 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:54:08 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:54:08 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct +(APIServer pid=1) INFO 04-22 01:54:08 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:54:08 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:54:08 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 8192, 'quantization': 'fp8', 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:54:08 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:54:08 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:54:08 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0024651 secs +(APIServer pid=1) INFO 04-22 01:54:08 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 01:54:08 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 01:54:08 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:54:08 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:54:08 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:54:08 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 01:54:08 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 01:54:08 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:54:09 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 01:54:09 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:54:09 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:54:09 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:54:09 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:54:13 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:54:13 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:54:13 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:54:13 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:54:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:54:13 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:54:13 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:54:13 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:54:13 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:54:13 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:54:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:54:13 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-22 01:54:18 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=245) DEBUG 04-22 01:54:19 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:54:19 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=245) DEBUG 04-22 01:54:19 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/dee96d97-f77e-444a-b32d-83078a8149a7'], outputs=['ipc:///tmp/0b541461-0bd5-4e24-b152-548bdd8dc3ad'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=245) DEBUG 04-22 01:54:19 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=245) DEBUG 04-22 01:54:19 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=245) DEBUG 04-22 01:54:19 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=245) DEBUG 04-22 01:54:19 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=245) DEBUG 04-22 01:54:19 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=245) INFO 04-22 01:54:19 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=fp8, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=245) DEBUG 04-22 01:54:20 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.43:46413 backend=nccl +(EngineCore pid=245) INFO 04-22 01:54:20 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.43:46413 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=245) DEBUG 04-22 01:54:20 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=245) INFO 04-22 01:54:20 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=245) DEBUG 04-22 01:54:20 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822860.681496, auto_measure=True +(EngineCore pid=245) DEBUG 04-22 01:54:20 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=245) DEBUG 04-22 01:54:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=245) DEBUG 04-22 01:54:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=245) DEBUG 04-22 01:54:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=245) DEBUG 04-22 01:54:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=245) DEBUG 04-22 01:54:20 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=245) DEBUG 04-22 01:54:20 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=245) DEBUG 04-22 01:54:20 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=245) INFO 04-22 01:54:20 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... +(EngineCore pid=245) INFO 04-22 01:54:21 [model_executor/.../linear/__init__.py:261] Selected CutlassFP8ScaledMMLinearKernel for Fp8OnlineLinearMethod +(EngineCore pid=245) INFO 04-22 01:54:21 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. +(EngineCore pid=245) DEBUG 04-22 01:54:21 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=245) INFO 04-22 01:54:21 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=245) INFO 04-22 01:54:21 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=245) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=245) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=245) DEBUG 04-22 01:54:21 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=245) DEBUG 04-22 01:54:21 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=245) DEBUG 04-22 01:54:21 [config/compilation.py:1195] disabled custom ops: Counter({'quant_fp8': 128, 'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=245) DEBUG 04-22 01:54:21 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=245) DEBUG 04-22 01:54:22 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors']] +(EngineCore pid=245) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(APIServer pid=1) DEBUG 04-22 01:54:39 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=245) DEBUG 04-22 
01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/fp8.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=245) INFO 04-22 01:54:42 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/b8b812d5c5/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1063] torch.compile 
cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=a909408016 comp=e546579c48 code=5379d6d5677d7cdb4c84e4840e951dc1c2c3978435fcf06c17411f78c54e9030 dir=/data/.cache/vllm/torch_compile_cache/b8b812d5c5/rank_0_0/backbone +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, 
+(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': 
None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore 
pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=245) DEBUG 04-22 01:54:42 
[compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=245) DEBUG 04-22 
01:54:42 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=245) DEBUG 04-22 01:54:42 
[compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=245) DEBUG 04-22 
01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore 
pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=245) 
DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 
'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] Vllm config hash: a909408016 +(EngineCore pid=245) INFO 04-22 01:54:42 [compilation/backends.py:1111] Dynamo bytecode transform time: 7.27 s +(EngineCore pid=245) DEBUG 04-22 01:54:42 
[compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=245) DEBUG 04-22 01:54:43 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=245) DEBUG 04-22 01:54:43 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=245) DEBUG 04-22 01:54:43 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.3 ms +(EngineCore pid=245) DEBUG 04-22 01:54:43 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=245) DEBUG 04-22 01:54:43 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=245) INFO 04-22 01:54:45 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=245) DEBUG 04-22 01:54:45 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/b8b812d5c5/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=245) DEBUG 04-22 01:54:46 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=245) DEBUG 04-22 01:54:46 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(EngineCore pid=245) DEBUG 04-22 01:54:46 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.9 ms +(EngineCore pid=245) DEBUG 04-22 01:54:46 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=245) DEBUG 04-22 01:54:46 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=245) DEBUG 04-22 
01:54:48 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/b8b812d5c5/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=245) DEBUG 04-22 01:54:49 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=245) DEBUG 04-22 01:54:49 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.4 ms +(EngineCore pid=245) DEBUG 04-22 01:54:49 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.7 ms +(EngineCore pid=245) DEBUG 04-22 01:54:49 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=245) DEBUG 04-22 01:54:49 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(APIServer pid=1) DEBUG 04-22 01:54:49 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=245) DEBUG 04-22 01:54:49 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/b8b812d5c5/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') +(EngineCore pid=245) INFO 04-22 01:54:49 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 7.36 s +(EngineCore pid=245) DEBUG 04-22 01:54:50 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/b8b812d5c5/rank_0_0/backbone/computation_graph.py +(EngineCore pid=245) INFO 04-22 01:54:52 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/b93efa87a050be75284475ae3f54704fbf7b8794a49f695fb7f3527dd72f6157/rank_0_0/model +(EngineCore pid=245) INFO 04-22 01:54:52 [compilation/monitor.py:48] torch.compile took 17.02 s in total +(EngineCore pid=245) INFO 04-22 01:54:52 [compilation/monitor.py:76] Initial profiling/warmup run took 0.32 s +(EngineCore pid=245) INFO 04-22 01:54:57 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=245) DEBUG 04-22 01:54:57 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=245) INFO 04-22 01:54:57 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 01:54:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore 
pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 01:54:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 98.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 01:54:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 01:54:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 260.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=245) DEBUG 04-22 01:54:59 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=245) INFO 04-22 01:54:59 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total +(EngineCore pid=245) DEBUG 04-22 
01:54:59 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=245) DEBUG 04-22 01:54:59 [v1/worker/gpu_worker.py:430] Free memory after profiling: 69.57 GiB (total), 66.12 GiB (within requested) +(EngineCore pid=245) DEBUG 04-22 01:54:59 [v1/worker/gpu_worker.py:435] Memory profiling takes 24.38 seconds. Total non KV cache memory: 10.62GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 8.49GiB. +(EngineCore pid=245) INFO 04-22 01:54:59 [v1/worker/gpu_worker.py:436] Available KV cache memory: 64.61 GiB +(EngineCore pid=245) INFO 04-22 01:54:59 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. +(EngineCore pid=245) INFO 04-22 01:54:59 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 529,248 tokens +(EngineCore pid=245) INFO 04-22 01:54:59 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 64.61x +(EngineCore pid=245) 2026-04-22 01:54:59,423 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(EngineCore pid=245) DEBUG 04-22 01:54:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) 2026-04-22 01:54:59,431 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=245) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:08:14 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:08:14 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 00:08:14 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:08:14 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:08:14 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:08:14 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct +(APIServer pid=1) INFO 04-22 00:08:14 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:08:14 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:08:14 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'dtype': 'float32', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:08:14 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:08:15 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:08:15 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0031874 secs +(APIServer pid=1) INFO 04-22 00:08:15 [config/model.py:549] 
Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 00:08:15 [config/model.py:2010] Upcasting torch.bfloat16 to torch.float32. +(APIServer pid=1) INFO 04-22 00:08:15 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:08:15 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:08:15 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:08:15 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:08:15 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:08:15 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 00:08:15 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:08:15 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:08:15 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:08:15 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:08:15 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:08:15 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:08:19 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:08:19 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:08:19 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:08:19 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:08:19 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
+DEBUG 04-22 00:08:19 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:08:19 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:08:19 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:08:19 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:08:19 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:08:19 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:08:19 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:08:24 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=244) DEBUG 04-22 00:08:25 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:08:25 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=244) DEBUG 04-22 00:08:25 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/1fb5a54e-3c4f-4104-a23e-0494dac61c29'], outputs=['ipc:///tmp/208aeb77-47c0-41a4-960c-7a979382f661'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=244) DEBUG 04-22 00:08:25 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=244) DEBUG 04-22 00:08:25 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=244) DEBUG 04-22 00:08:25 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=244) DEBUG 04-22 00:08:25 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=244) DEBUG 04-22 00:08:25 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=244) INFO 04-22 00:08:25 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float32, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=244) DEBUG 04-22 00:08:26 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.23:60703 backend=nccl +(EngineCore pid=244) INFO 04-22 00:08:26 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.23:60703 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) DEBUG 04-22 00:08:26 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) INFO 04-22 00:08:26 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=244) DEBUG 04-22 00:08:26 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776816506.7239153, auto_measure=True +(EngineCore pid=244) DEBUG 04-22 00:08:26 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=244) DEBUG 04-22 00:08:26 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:08:26 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:08:26 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:08:26 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=244) DEBUG 04-22 00:08:26 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=244) DEBUG 04-22 00:08:26 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=244) DEBUG 04-22 00:08:26 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=244) INFO 04-22 00:08:26 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... +(EngineCore pid=244) DEBUG 04-22 00:08:27 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.float32, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASH_ATTN: [dtype not supported], FLASHINFER: [dtype not supported]}. +(EngineCore pid=244) INFO 04-22 00:08:27 [platforms/cuda.py:334] Using TRITON_ATTN attention backend out of potential backends: ['TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=244) DEBUG 04-22 00:08:27 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=244) DEBUG 04-22 00:08:27 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=244) DEBUG 04-22 00:08:27 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=244) DEBUG 04-22 00:08:27 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(EngineCore pid=244) DEBUG 04-22 00:08:27 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00002-of-00004.safetensors']] +(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore 
pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=244) INFO 04-22 00:08:43 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/9bfd138913/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=78754e407e comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/9bfd138913/rank_0_0/backbone +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] Compile env 
factors (raw): +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': 
False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, 
+(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=244) 
DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 
[compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=244) DEBUG 04-22 00:08:43 
[compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 
[compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=244) DEBUG 04-22 
00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, 
+(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore 
pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 
[compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] Vllm config hash: 78754e407e +(EngineCore pid=244) INFO 04-22 00:08:43 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.25 s +(EngineCore pid=244) DEBUG 04-22 00:08:44 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=244) DEBUG 04-22 00:08:44 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=244) /usr/local/lib/python3.12/dist-packages/torch/_inductor/compile_fx.py:321: UserWarning: TensorFloat32 tensor 
cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance. +(EngineCore pid=244) warnings.warn( +(EngineCore pid=244) DEBUG 04-22 00:08:44 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:08:44 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=244) DEBUG 04-22 00:08:44 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=244) DEBUG 04-22 00:08:44 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:08:44 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(APIServer pid=1) DEBUG 04-22 00:08:45 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=244) INFO 04-22 00:08:46 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=244) DEBUG 04-22 00:08:46 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/9bfd138913/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=244) DEBUG 04-22 00:08:46 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:08:46 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=244) DEBUG 04-22 00:08:46 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=244) DEBUG 04-22 00:08:46 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:08:46 
[compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=244) DEBUG 04-22 00:08:47 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/9bfd138913/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=244) DEBUG 04-22 00:08:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:08:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(EngineCore pid=244) DEBUG 04-22 00:08:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=244) DEBUG 04-22 00:08:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:08:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=244) DEBUG 04-22 00:08:49 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/9bfd138913/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') +(EngineCore pid=244) INFO 04-22 00:08:49 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.10 s +(EngineCore pid=244) DEBUG 04-22 00:08:49 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/9bfd138913/rank_0_0/backbone/computation_graph.py +(EngineCore pid=244) INFO 04-22 00:08:50 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/5785960f71019408b49f0614e6897e9613551919aefdda4bdab95f0e33283b91/rank_0_0/model +(EngineCore pid=244) INFO 04-22 00:08:50 
[compilation/monitor.py:48] torch.compile took 10.81 s in total +(EngineCore pid=244) INFO 04-22 00:08:53 [compilation/monitor.py:76] Initial profiling/warmup run took 2.57 s +(APIServer pid=1) DEBUG 04-22 00:08:55 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=244) INFO 04-22 00:08:58 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=244) DEBUG 04-22 00:08:58 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=244) INFO 04-22 00:08:58 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=244) DEBUG 04-22 00:08:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:08:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:08:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:08:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:08:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:08:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:08:59 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 224.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-22 00:08:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 
00:08:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:08:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:09:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:09:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:09:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:09:00 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 4.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-22 00:09:00 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=244) INFO 04-22 00:09:00 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.80 GiB total +(EngineCore pid=244) DEBUG 04-22 00:09:01 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=244) DEBUG 04-22 00:09:01 [v1/worker/gpu_worker.py:430] Free memory after profiling: 46.29 GiB (total), 42.84 GiB (within requested) +(EngineCore pid=244) DEBUG 04-22 00:09:01 [v1/worker/gpu_worker.py:435] Memory profiling takes 21.36 seconds. Total non KV cache memory: 32.43GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.25GiB; weights memory: 29.98GiB. 
+(EngineCore pid=244) INFO 04-22 00:09:01 [v1/worker/gpu_worker.py:436] Available KV cache memory: 42.8 GiB +(EngineCore pid=244) INFO 04-22 00:09:01 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9602 to maintain the same effective KV cache size. +(EngineCore pid=244) INFO 04-22 00:09:01 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 175,296 tokens +(EngineCore pid=244) INFO 04-22 00:09:01 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 21.40x +(EngineCore pid=244) 2026-04-22 00:09:01,121 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=244) DEBUG 04-22 00:09:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) 2026-04-22 00:09:01,135 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:47:15 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:47:15 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:47:15 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:47:15 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:47:15 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:47:15 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct +(APIServer pid=1) INFO 04-22 01:47:15 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:47:15 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:47:15 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 16384, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:47:15 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:47:15 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 01:47:15 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0005018 secs +(APIServer pid=1) INFO 04-22 01:47:15 [config/model.py:549] Resolved architecture: LlamaForCausalLM +(APIServer pid=1) INFO 04-22 01:47:15 [config/model.py:1678] Using max model len 16384 +(APIServer pid=1) DEBUG 04-22 01:47:15 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:47:15 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:47:15 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:47:15 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:47:15 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) INFO 04-22 01:47:15 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:47:15 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 01:47:15 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:47:15 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:47:16 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:47:16 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:47:19 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:47:19 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:47:19 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:47:19 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:47:19 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:47:19 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:47:19 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:47:19 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:47:19 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:47:19 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:47:19 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:47:19 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:47:24 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+(EngineCore pid=243) DEBUG 04-22 01:47:25 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 01:47:25 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=243) DEBUG 04-22 01:47:25 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/354c5b62-d6ba-4a15-b1bf-57fa739977cb'], outputs=['ipc:///tmp/2cd607a0-3d07-47c0-8b87-18d629b2f761'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=243) DEBUG 04-22 01:47:25 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=243) DEBUG 04-22 01:47:25 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=243) DEBUG 04-22 01:47:25 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=243) DEBUG 04-22 01:47:25 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=243) DEBUG 04-22 01:47:25 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=243) INFO 04-22 01:47:25 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=243) DEBUG 04-22 01:47:26 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.37:35185 backend=nccl +(EngineCore pid=243) INFO 04-22 01:47:26 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.37:35185 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) DEBUG 04-22 01:47:26 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=243) INFO 04-22 01:47:26 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=243) DEBUG 04-22 01:47:26 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822446.960836, auto_measure=True +(EngineCore pid=243) DEBUG 04-22 01:47:26 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=243) DEBUG 04-22 01:47:27 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:47:27 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:47:27 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=243) DEBUG 04-22 01:47:27 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=243) DEBUG 04-22 01:47:27 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=243) DEBUG 04-22 01:47:27 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=243) DEBUG 04-22 01:47:27 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=243) INFO 04-22 01:47:27 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... +(EngineCore pid=243) DEBUG 04-22 01:47:27 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=243) INFO 04-22 01:47:27 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=243) INFO 04-22 01:47:27 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=243) DEBUG 04-22 01:47:27 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=243) DEBUG 04-22 01:47:27 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=243) DEBUG 04-22 01:47:27 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=243) DEBUG 04-22 01:47:27 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=243) DEBUG 04-22 01:47:28 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00004-of-00004.safetensors']] +(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=243) DEBUG 04-22 
01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=243) INFO 
04-22 01:47:42 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/ff8681c1a3/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=84e6e50ad7 comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/ff8681c1a3/rank_0_0/backbone +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, 
+(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 
[compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 
[compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, 
+(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, 
+(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore 
pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=243) 
DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
+(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] Vllm config hash: 84e6e50ad7 +(EngineCore pid=243) INFO 04-22 01:47:42 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.36 s +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=243) INFO 04-22 01:47:44 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=243) DEBUG 04-22 01:47:44 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/ff8681c1a3/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=243) DEBUG 04-22 01:47:44 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:47:44 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=243) DEBUG 04-22 01:47:44 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms +(EngineCore pid=243) DEBUG 
04-22 01:47:44 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:47:44 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(APIServer pid=1) DEBUG 04-22 01:47:45 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=243) DEBUG 04-22 01:47:46 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/ff8681c1a3/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=243) DEBUG 04-22 01:47:47 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=243) DEBUG 04-22 01:47:47 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(EngineCore pid=243) DEBUG 04-22 01:47:47 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=243) DEBUG 04-22 01:47:47 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=243) DEBUG 04-22 01:47:47 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=243) DEBUG 04-22 01:47:47 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/ff8681c1a3/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') +(EngineCore pid=243) INFO 04-22 01:47:47 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.20 s +(EngineCore pid=243) DEBUG 04-22 01:47:47 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/ff8681c1a3/rank_0_0/backbone/computation_graph.py +(EngineCore 
pid=243) INFO 04-22 01:47:48 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/53c51fc11d01c95c773fe7cf86dfcdc35a297d8ecf40ebf20a05b5e48e2fba95/rank_0_0/model +(EngineCore pid=243) INFO 04-22 01:47:48 [compilation/monitor.py:48] torch.compile took 11.07 s in total +(EngineCore pid=243) INFO 04-22 01:47:49 [compilation/monitor.py:76] Initial profiling/warmup run took 0.45 s +(EngineCore pid=243) INFO 04-22 01:47:54 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=243) DEBUG 04-22 01:47:54 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=243) INFO 04-22 01:47:54 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:47:55 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:47:55 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 138.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=243) 
DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:47:55 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) DEBUG 04-22 01:47:55 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 260.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=243) INFO 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total +(APIServer pid=1) DEBUG 04-22 01:47:55 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.24 GiB (total), 58.79 GiB (within requested) +(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_worker.py:435] Memory profiling takes 18.17 seconds. Total non KV cache memory: 17.12GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 14.99GiB. 
+(EngineCore pid=243) INFO 04-22 01:47:55 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.11 GiB +(EngineCore pid=243) INFO 04-22 01:47:55 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. +(EngineCore pid=243) INFO 04-22 01:47:55 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 476,016 tokens +(EngineCore pid=243) INFO 04-22 01:47:55 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 16,384 tokens per request: 29.05x +(EngineCore pid=243) 2026-04-22 01:47:55,992 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=243) 2026-04-22 01:47:56,001 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:29:59 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:29:59 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 01:29:59 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:29:59 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 01:29:59 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 01:29:59 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-4-Scout-17B-16E-Instruct +(APIServer pid=1) INFO 04-22 01:29:59 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 01:29:59 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 01:29:59 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-4-Scout-17B-16E-Instruct', 'model': 'meta-llama/Llama-4-Scout-17B-16E-Instruct', 'max_model_len': 8192, 'tensor_parallel_size': 4, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 01:29:59 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 01:30:00 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.mllama4.Llama4ForConditionalGeneration from cache +(APIServer pid=1) DEBUG 04-22 01:30:00 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0008244 secs +(APIServer pid=1) INFO 04-22 01:30:00 [config/model.py:549] Resolved architecture: Llama4ForConditionalGeneration +(APIServer pid=1) INFO 04-22 01:30:00 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 01:30:00 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 01:30:00 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 01:30:00 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 01:30:00 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 01:30:00 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 01:30:00 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 01:30:00 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 01:30:00 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) INFO 04-22 01:30:01 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 01:30:01 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 01:30:01 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:30:02 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 01:30:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(APIServer pid=1) DEBUG 04-22 01:30:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(APIServer pid=1) DEBUG 04-22 01:30:03 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. 
+(APIServer pid=1) DEBUG 04-22 01:30:03 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 01:30:13 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:30:13 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:30:13 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:30:13 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:30:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:30:13 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:30:13 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:30:13 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:30:13 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:30:13 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:30:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:30:13 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:30:17 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=470) DEBUG 04-22 01:30:19 [v1/engine/core.py:1018] Waiting for init message from front-end. 
+(EngineCore pid=470) DEBUG 04-22 01:30:19 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/d183e7bb-6eef-4da4-943b-f32769998c39'], outputs=['ipc:///tmp/1364cfc6-7766-4264-bfe9-0dd5982225b8'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(APIServer pid=1) DEBUG 04-22 01:30:19 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=470) DEBUG 04-22 01:30:19 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=470) DEBUG 04-22 01:30:19 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=470) DEBUG 04-22 01:30:19 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=470) DEBUG 04-22 01:30:19 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=470) DEBUG 04-22 01:30:19 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=470) INFO 04-22 01:30:19 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-4-Scout-17B-16E-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-4-Scout-17B-16E-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-4-Scout-17B-16E-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [204, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=470) WARNING 04-22 01:30:19 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=470) INFO 04-22 01:30:19 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.7.87 (local), world_size=4, local_world_size=4 +(EngineCore pid=470) DEBUG 04-22 01:30:19 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/e7240d42-f49f-4d17-9d01-cc24e0453b78 +(EngineCore pid=470) DEBUG 04-22 01:30:19 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_2b2306a0'), local_subscribe_addr='ipc:///tmp/e7240d42-f49f-4d17-9d01-cc24e0453b78', local_notify_addr='ipc:///tmp/36af057c-4fa5-42de-9705-f0cd0cc565fb', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 01:30:22 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:30:22 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:30:22 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:30:22 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:30:22 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:30:22 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:30:22 [platforms/__init__.py:134] Checking if XPU platform is available. 
+DEBUG 04-22 01:30:22 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:30:22 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:30:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:30:22 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:30:22 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 01:30:22 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:30:22 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 01:30:22 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
+DEBUG 04-22 01:30:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:30:22 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 01:30:22 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:30:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 01:30:22 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 01:30:27 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:30:27 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:30:27 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 01:30:27 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 01:30:28 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:30:28 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:30:28 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:30:28 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 01:30:29 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:30:29 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:30:29 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:30:29 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 01:30:29 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:30:29 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:30:29 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:30:29 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+DEBUG 04-22 01:30:29 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 01:30:29 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 01:30:29 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 01:30:29 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) DEBUG 04-22 01:30:29 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +DEBUG 04-22 01:30:29 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +DEBUG 04-22 01:30:29 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +DEBUG 04-22 01:30:29 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +DEBUG 04-22 01:30:29 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +DEBUG 04-22 01:30:29 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +DEBUG 04-22 01:30:29 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +DEBUG 04-22 01:30:29 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +DEBUG 04-22 01:30:29 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +DEBUG 04-22 01:30:29 [compilation/decorators.py:213] Inferred dynamic dimensions for forward 
method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +DEBUG 04-22 01:30:29 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +DEBUG 04-22 01:30:29 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +DEBUG 04-22 01:30:29 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(Worker pid=669) DEBUG 04-22 01:30:30 [distributed/parallel_state.py:1356] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:38185 backend=nccl +(Worker pid=669) INFO 04-22 01:30:30 [distributed/parallel_state.py:1400] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:38185 backend=nccl +(Worker pid=670) DEBUG 04-22 01:30:31 [distributed/parallel_state.py:1356] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:38185 backend=nccl +(Worker pid=670) INFO 04-22 01:30:31 [distributed/parallel_state.py:1400] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:38185 backend=nccl +(Worker pid=671) DEBUG 04-22 01:30:31 [distributed/parallel_state.py:1356] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:38185 backend=nccl +(Worker pid=671) INFO 04-22 01:30:31 [distributed/parallel_state.py:1400] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:38185 backend=nccl +(Worker pid=672) DEBUG 04-22 01:30:31 [distributed/parallel_state.py:1356] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:38185 backend=nccl +(Worker pid=672) INFO 04-22 01:30:31 [distributed/parallel_state.py:1400] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:38185 backend=nccl +[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 2 is connected to 3 peer ranks. 
Expected number of connected peer ranks is : 3 +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=670) DEBUG 04-22 01:30:31 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=671) DEBUG 04-22 01:30:31 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=672) DEBUG 04-22 01:30:31 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=669) DEBUG 04-22 01:30:31 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=670) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=671) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=670) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=671) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=672) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
+(Worker pid=672) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=669) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=669) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=669) DEBUG 04-22 01:30:31 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=669) INFO 04-22 01:30:31 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=670) DEBUG 04-22 01:30:32 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=671) DEBUG 04-22 01:30:32 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=672) DEBUG 04-22 01:30:32 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=669) DEBUG 04-22 01:30:32 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=669) DEBUG 04-22 01:30:32 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/2470ef01-879c-4c68-8a4f-d117d276b4de +(Worker pid=669) DEBUG 04-22 01:30:32 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3], buffer_handle=(3, 4194304, 6, 'psm_62c7f776'), local_subscribe_addr='ipc:///tmp/2470ef01-879c-4c68-8a4f-d117d276b4de', local_notify_addr='ipc:///tmp/7101c424-5035-49eb-b6bb-ea7d4a2f6f23', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=671) DEBUG 04-22 01:30:32 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2470ef01-879c-4c68-8a4f-d117d276b4de +(Worker pid=672) DEBUG 04-22 01:30:32 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2470ef01-879c-4c68-8a4f-d117d276b4de +(Worker pid=670) DEBUG 04-22 01:30:32 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2470ef01-879c-4c68-8a4f-d117d276b4de +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=669) INFO 04-22 01:30:33 [distributed/parallel_state.py:1716] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. 
+(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 605, in __init__ +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] self.worker.init_device() +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 312, in init_device +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] self.worker.init_device() # type: ignore +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ 
+(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 283, in init_device +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] self.requested_memory = request_memory(init_snapshot, self.cache_config) +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/utils.py", line 413, in request_memory +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] raise ValueError( +(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] ValueError: Free memory on device cuda:1 (60.33/79.19 GiB) on startup is less than desired GPU memory utilization (0.95, 75.23 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes. 
+(EngineCore pid=470) DEBUG 04-22 01:30:33 [v1/executor/multiproc_executor.py:419] Worker Termination: allow workers to gracefully shutdown +(Worker pid=672) DEBUG 04-22 01:30:33 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.12GiB, total_memory=79.19GiB, cuda_memory=2.07GiB, torch_memory=0.02GiB, non_torch_memory=2.05GiB, timestamp=1776821433.6940672, auto_measure=True +(Worker pid=672) DEBUG 04-22 01:30:33 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=669) DEBUG 04-22 01:30:33 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.12GiB, total_memory=79.19GiB, cuda_memory=2.07GiB, torch_memory=0.02GiB, non_torch_memory=2.05GiB, timestamp=1776821433.7040389, auto_measure=True +(Worker pid=669) DEBUG 04-22 01:30:33 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=671) DEBUG 04-22 01:30:33 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.12GiB, total_memory=79.19GiB, cuda_memory=2.07GiB, torch_memory=0.02GiB, non_torch_memory=2.05GiB, timestamp=1776821433.7163646, auto_measure=True +(Worker pid=671) DEBUG 04-22 01:30:33 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=672) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=672) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=669) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=669) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 
'positions', 'hidden_states', 'input_embeds'] +(Worker pid=671) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=671) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=672) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=669) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=672) DEBUG 04-22 01:30:33 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=669) DEBUG 04-22 01:30:33 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=671) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=669) DEBUG 04-22 01:30:33 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=671) DEBUG 04-22 01:30:33 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). 
+(EngineCore pid=470) DEBUG 04-22 01:30:37 [v1/executor/multiproc_executor.py:424] Worker Termination: workers still running sending SIGTERM +(Worker pid=672) DEBUG 04-22 01:30:37 [v1/executor/multiproc_executor.py:794] WorkerProc handling signal 15, raising SystemExit +(Worker pid=672) Exception ignored in: TypeError("'str' object cannot be converted to 'AddedToken'") +(Worker pid=672) Traceback (most recent call last): +(Worker pid=672) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 797, in signal_handler +(Worker pid=672) raise SystemExit() +(Worker pid=672) SystemExit: +(Worker pid=669) DEBUG 04-22 01:30:37 [v1/executor/multiproc_executor.py:794] WorkerProc handling signal 15, raising SystemExit +(Worker pid=669) WARNING 04-22 01:30:37 [v1/executor/multiproc_executor.py:871] WorkerProc was terminated +(Worker pid=671) DEBUG 04-22 01:30:37 [v1/executor/multiproc_executor.py:794] WorkerProc handling signal 15, raising SystemExit +(Worker pid=671) WARNING 04-22 01:30:37 [v1/executor/multiproc_executor.py:871] WorkerProc was terminated +[rank0]:[W422 01:30:38.196588814 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) +(APIServer pid=1) DEBUG 04-22 01:30:39 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +[rank3]:[W422 01:30:39.076626420 TCPStore.cpp:125] [c10d] recvValue failed on SocketImpl(fd=71, addr=[localhost]:58688, remote=[localhost]:38185): Failed to recv, got 0 bytes. Connection was likely closed. Did the remote server shutdown or crash? 
+Exception raised from recvBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:682 (most recent call first): +frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x9d (0x7efc950ddfdd in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) +frame #1: + 0x6a3325d (0x7efbaf67d25d in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) +frame #2: c10d::TCPStore::check(std::vector, std::allocator >, std::allocator, std::allocator > > > const&) + 0x273 (0x7efbaf67b1f3 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) +frame #3: c10d::ProcessGroupNCCL::HeartbeatMonitor::runLoop() + 0x44c (0x7efb50df79cc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) +frame #4: + 0xdc253 (0x7efc564b0253 in /lib/x86_64-linux-gnu/libstdc++.so.6) +frame #5: + 0x94ac3 (0x7efc95e39ac3 in /lib/x86_64-linux-gnu/libc.so.6) +frame #6: clone + 0x44 (0x7efc95ecaa84 in /lib/x86_64-linux-gnu/libc.so.6) + +[rank3]:[W422 01:30:39.079729534 ProcessGroupNCCL.cpp:1802] [PG ID 0 PG GUID 0 Rank 3] Failed to check the "should dump" flag on TCPStore, (maybe TCPStore server has shut down too early), with error: Failed to recv, got 0 bytes. Connection was likely closed. Did the remote server shutdown or crash? 
+[rank3]:[W422 01:30:40.079908370 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=71, addr=[localhost]:58688, remote=[localhost]:38185): Broken pipe +Exception raised from sendBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): +frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x9d (0x7efc950ddfdd in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) +frame #1: + 0x6a326d1 (0x7efbaf67c6d1 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) +frame #2: c10d::TCPStore::check(std::vector, std::allocator >, std::allocator, std::allocator > > > const&) + 0x24d (0x7efbaf67b1cd in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) +frame #3: c10d::ProcessGroupNCCL::HeartbeatMonitor::runLoop() + 0x44c (0x7efb50df79cc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) +frame #4: + 0xdc253 (0x7efc564b0253 in /lib/x86_64-linux-gnu/libstdc++.so.6) +frame #5: + 0x94ac3 (0x7efc95e39ac3 in /lib/x86_64-linux-gnu/libc.so.6) +frame #6: clone + 0x44 (0x7efc95ecaa84 in /lib/x86_64-linux-gnu/libc.so.6) + +[rank3]:[W422 01:30:40.084561021 ProcessGroupNCCL.cpp:1802] [PG ID 0 PG GUID 0 Rank 3] Failed to check the "should dump" flag on TCPStore, (maybe TCPStore server has shut down too early), with error: Broken pipe +(Worker_TP3 pid=672) DEBUG 04-22 01:30:40 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP3 pid=672) DEBUG 04-22 01:30:40 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP3 pid=672) DEBUG 04-22 01:30:40 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) +(Worker_TP3 pid=672) DEBUG 04-22 01:30:40 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 
'logits_processor': 1}) +(Worker_TP3 pid=672) DEBUG 04-22 01:30:40 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) +(Worker_TP3 pid=672) DEBUG 04-22 01:30:40 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP3 pid=672) DEBUG 04-22 01:30:40 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP3 pid=672) DEBUG 04-22 01:30:41 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00050-of-00050.safetensors', 'model-00031-of-00050.safetensors', 'model-00029-of-00050.safetensors', 'model-00011-of-00050.safetensors', 'model-00048-of-00050.safetensors', 'model-00038-of-00050.safetensors', 'model-00010-of-00050.safetensors', 'model-00015-of-00050.safetensors', 'model-00045-of-00050.safetensors', 'model-00009-of-00050.safetensors', 'model-00016-of-00050.safetensors', 'model-00013-of-00050.safetensors', 'model-00025-of-00050.safetensors', 'model-00026-of-00050.safetensors', 'model-00018-of-00050.safetensors', 'model-00008-of-00050.safetensors', 'model-00037-of-00050.safetensors', 'model-00034-of-00050.safetensors', 'model-00005-of-00050.safetensors', 'model-00002-of-00050.safetensors', 'model-00006-of-00050.safetensors', 'model-00035-of-00050.safetensors', 'model-00003-of-00050.safetensors', 'model-00040-of-00050.safetensors', 'model-00012-of-00050.safetensors', 'model-00004-of-00050.safetensors', 'model-00020-of-00050.safetensors', 'model-00022-of-00050.safetensors', 'model-00047-of-00050.safetensors', 'model-00030-of-00050.safetensors', 'model-00039-of-00050.safetensors', 'model-00028-of-00050.safetensors', 'model-00032-of-00050.safetensors', 'model-00023-of-00050.safetensors', 'model-00041-of-00050.safetensors', 'model-00044-of-00050.safetensors', 
'model-00017-of-00050.safetensors', 'model-00043-of-00050.safetensors', 'model-00049-of-00050.safetensors', 'model-00024-of-00050.safetensors', 'model-00046-of-00050.safetensors', 'model-00033-of-00050.safetensors', 'model-00019-of-00050.safetensors', 'model-00021-of-00050.safetensors', 'model-00007-of-00050.safetensors', 'model-00027-of-00050.safetensors', 'model-00036-of-00050.safetensors', 'model-00001-of-00050.safetensors', 'model-00042-of-00050.safetensors', 'model-00014-of-00050.safetensors']] +[rank3]:[W422 01:30:41.084720813 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=71, addr=[localhost]:58688, remote=[localhost]:38185): Broken pipe +Exception raised from sendBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): +frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x9d (0x7efc950ddfdd in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) +frame #1: + 0x6a326d1 (0x7efbaf67c6d1 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) +frame #2: c10d::TCPStore::check(std::vector, std::allocator >, std::allocator, std::allocator > > > const&) + 0x24d (0x7efbaf67b1cd in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) +frame #3: c10d::ProcessGroupNCCL::HeartbeatMonitor::runLoop() + 0x44c (0x7efb50df79cc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) +frame #4: + 0xdc253 (0x7efc564b0253 in /lib/x86_64-linux-gnu/libstdc++.so.6) +frame #5: + 0x94ac3 (0x7efc95e39ac3 in /lib/x86_64-linux-gnu/libc.so.6) +frame #6: clone + 0x44 (0x7efc95ecaa84 in /lib/x86_64-linux-gnu/libc.so.6) + +[rank3]:[W422 01:30:41.087290159 ProcessGroupNCCL.cpp:1802] [PG ID 0 PG GUID 0 Rank 3] Failed to check the "should dump" flag on TCPStore, (maybe TCPStore server has shut down too early), with error: Broken pipe +(EngineCore pid=470) DEBUG 04-22 01:30:41 [v1/executor/multiproc_executor.py:429] Worker Termination: resorting to SIGKILL to 
take down workers +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] EngineCore failed to start. +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] Traceback (most recent call last): +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] return func(*args, **kwargs) +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__ +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] super().__init__( +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 114, in __init__ +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] self.model_executor = executor_class(vllm_config) +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 101, in __init__ +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] super().__init__(vllm_config) +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File 
"/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] return func(*args, **kwargs) +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 103, in __init__ +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] self._init_executor() +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 190, in _init_executor +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] self.workers = WorkerProc.wait_for_ready(unready_workers) +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 736, in wait_for_ready +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] raise e from None +(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause. 
+(EngineCore pid=470) Process EngineCore: +(EngineCore pid=470) Traceback (most recent call last): +(EngineCore pid=470) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap +(EngineCore pid=470) self.run() +(EngineCore pid=470) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run +(EngineCore pid=470) self._target(*self._args, **self._kwargs) +(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1112, in run_engine_core +(EngineCore pid=470) raise e +(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core +(EngineCore pid=470) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) +(EngineCore pid=470) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(EngineCore pid=470) return func(*args, **kwargs) +(EngineCore pid=470) ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__ +(EngineCore pid=470) super().__init__( +(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 114, in __init__ +(EngineCore pid=470) self.model_executor = executor_class(vllm_config) +(EngineCore pid=470) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 101, in __init__ +(EngineCore pid=470) super().__init__(vllm_config) +(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(EngineCore pid=470) return func(*args, **kwargs) +(EngineCore pid=470) ^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 103, in __init__ +(EngineCore pid=470) self._init_executor() 
+(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 190, in _init_executor +(EngineCore pid=470) self.workers = WorkerProc.wait_for_ready(unready_workers) +(EngineCore pid=470) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 736, in wait_for_ready +(EngineCore pid=470) raise e from None +(EngineCore pid=470) Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause. +(EngineCore pid=470) DEBUG 04-22 01:30:42 [v1/executor/multiproc_executor.py:438] Triggering shutdown of workers +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in +(APIServer pid=1) sys.exit(main()) +(APIServer pid=1) ^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main +(APIServer pid=1) args.dispatch_function(args) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd +(APIServer pid=1) uvloop.run(run_server(args)) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run +(APIServer pid=1) return __asyncio.run( +(APIServer pid=1) ^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run +(APIServer pid=1) return runner.run(main) +(APIServer pid=1) ^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run +(APIServer pid=1) return self._loop.run_until_complete(task) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper +(APIServer pid=1) return await main 
+(APIServer pid=1) ^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server +(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker +(APIServer pid=1) async with build_async_engine_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client +(APIServer pid=1) async with build_async_engine_client_from_engine_args( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args +(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config +(APIServer pid=1) return cls( +(APIServer pid=1) ^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 154, in __init__ +(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(APIServer pid=1) return func(*args, **kwargs) +(APIServer pid=1) 
^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client +(APIServer pid=1) return AsyncMPClient(*client_args) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(APIServer pid=1) return func(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 887, in __init__ +(APIServer pid=1) super().__init__( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 535, in __init__ +(APIServer pid=1) with launch_core_engines( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ +(APIServer pid=1) next(self.gen) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines +(APIServer pid=1) wait_for_engine_startup( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup +(APIServer pid=1) raise RuntimeError( +(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} +/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown + warnings.warn('resource_tracker: There appear to be %d ' +/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown + warnings.warn('resource_tracker: There appear to be %d ' diff --git a/accuracy/results/v0.19.0/logs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.log new file mode 100644 index 00000000..bf98beb9 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.log @@ -0,0 +1,2342 @@ +DEBUG 04-23 00:11:13 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-23 00:11:13 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-23 00:11:13 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-23 00:11:13 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-23 00:11:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-23 00:11:13 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-23 00:11:13 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-23 00:11:13 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-23 00:11:13 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-23 00:11:13 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-23 00:11:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-23 00:11:13 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-23 00:11:18 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-23 00:11:20 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-23 00:11:20 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-23 00:11:20 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-23 00:11:20 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-23 00:11:20 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-23 00:11:20 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-23 00:11:20 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-23 00:11:20 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-23 00:11:20 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-4-Scout-17B-16E-Instruct +(APIServer pid=1) INFO 04-23 00:11:20 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-23 00:11:20 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-23 00:11:20 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-4-Scout-17B-16E-Instruct', 'model': 'meta-llama/Llama-4-Scout-17B-16E-Instruct', 'max_model_len': 8192, 'tensor_parallel_size': 4, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-23 00:11:20 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-23 00:11:20 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.mllama4.Llama4ForConditionalGeneration from cache +(APIServer pid=1) DEBUG 04-23 00:11:20 
[logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0015249 secs +(APIServer pid=1) INFO 04-23 00:11:20 [config/model.py:549] Resolved architecture: Llama4ForConditionalGeneration +(APIServer pid=1) INFO 04-23 00:11:20 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-23 00:11:20 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-23 00:11:20 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-23 00:11:20 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-23 00:11:20 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-23 00:11:20 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-23 00:11:20 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-23 00:11:20 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-23 00:11:20 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) INFO 04-23 00:11:22 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-23 00:11:22 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. 
+(APIServer pid=1) DEBUG 04-23 00:11:22 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-23 00:11:23 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-23 00:11:23 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(APIServer pid=1) DEBUG 04-23 00:11:23 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(APIServer pid=1) DEBUG 04-23 00:11:24 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. +(APIServer pid=1) DEBUG 04-23 00:11:24 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-23 00:11:34 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-23 00:11:34 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-23 00:11:34 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-23 00:11:34 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-23 00:11:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-23 00:11:34 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-23 00:11:34 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-23 00:11:34 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-23 00:11:34 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-23 00:11:34 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-23 00:11:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-23 00:11:34 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-23 00:11:38 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=469) DEBUG 04-23 00:11:40 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-23 00:11:40 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=469) DEBUG 04-23 00:11:40 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/de95e44d-b5c4-4856-85f7-6227a379d9a2'], outputs=['ipc:///tmp/a92eb3b7-4c3e-4857-8820-5bf77663f228'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=469) DEBUG 04-23 00:11:40 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=469) DEBUG 04-23 00:11:40 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=469) DEBUG 04-23 00:11:40 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=469) DEBUG 04-23 00:11:40 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=469) DEBUG 04-23 00:11:40 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=469) INFO 04-23 00:11:40 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-4-Scout-17B-16E-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-4-Scout-17B-16E-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-4-Scout-17B-16E-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [204, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=469) WARNING 04-23 00:11:40 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=469) INFO 04-23 00:11:40 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.128.11.101 (local), world_size=4, local_world_size=4 +(EngineCore pid=469) DEBUG 04-23 00:11:40 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/fe9552c7-f71c-49a7-a745-1a0d33ad6db5 +(EngineCore pid=469) DEBUG 04-23 00:11:40 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_4a7b0a34'), local_subscribe_addr='ipc:///tmp/fe9552c7-f71c-49a7-a745-1a0d33ad6db5', local_notify_addr='ipc:///tmp/e2411317-cee2-4e4e-8b72-6f10ce156f0f', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-23 00:11:43 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-23 00:11:43 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-23 00:11:43 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-23 00:11:43 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-23 00:11:43 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-23 00:11:43 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:62] Checking if CUDA platform is available. 
+DEBUG 04-23 00:11:43 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-23 00:11:43 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-23 00:11:43 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-23 00:11:43 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-23 00:11:43 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-23 00:11:43 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-23 00:11:43 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-23 00:11:43 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
+DEBUG 04-23 00:11:43 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-23 00:11:43 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-23 00:11:43 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-23 00:11:43 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-23 00:11:43 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-23 00:11:43 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-23 00:11:44 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-23 00:11:44 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-23 00:11:44 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-23 00:11:44 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-23 00:11:44 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-23 00:11:44 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-23 00:11:44 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-23 00:11:44 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-23 00:11:48 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-23 00:11:48 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-23 00:11:48 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-23 00:11:49 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-23 00:11:50 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-23 00:11:50 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-23 00:11:50 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-23 00:11:50 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-23 00:11:50 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-23 00:11:50 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-23 00:11:50 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-23 00:11:50 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-23 00:11:50 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-23 00:11:50 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-23 00:11:50 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-23 00:11:50 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) DEBUG 04-23 00:11:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+DEBUG 04-23 00:11:50 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +DEBUG 04-23 00:11:50 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +DEBUG 04-23 00:11:50 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +DEBUG 04-23 00:11:50 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +DEBUG 04-23 00:11:50 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-23 00:11:50 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-23 00:11:50 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-23 00:11:50 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+DEBUG 04-23 00:11:50 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +DEBUG 04-23 00:11:50 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +DEBUG 04-23 00:11:50 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +DEBUG 04-23 00:11:50 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +DEBUG 04-23 00:11:50 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +DEBUG 04-23 00:11:50 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +DEBUG 04-23 00:11:50 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +DEBUG 04-23 00:11:51 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(Worker pid=668) DEBUG 04-23 00:11:52 [distributed/parallel_state.py:1356] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:39093 backend=nccl +(Worker pid=668) INFO 04-23 00:11:52 [distributed/parallel_state.py:1400] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:39093 backend=nccl +(Worker pid=670) DEBUG 04-23 00:11:52 [distributed/parallel_state.py:1356] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:39093 backend=nccl +(Worker pid=670) INFO 04-23 00:11:52 [distributed/parallel_state.py:1400] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:39093 backend=nccl +(Worker pid=671) DEBUG 04-23 00:11:52 [distributed/parallel_state.py:1356] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:39093 backend=nccl +(Worker pid=671) INFO 04-23 00:11:52 
[distributed/parallel_state.py:1400] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:39093 backend=nccl +(Worker pid=669) DEBUG 04-23 00:11:52 [distributed/parallel_state.py:1356] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:39093 backend=nccl +(Worker pid=669) INFO 04-23 00:11:52 [distributed/parallel_state.py:1400] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:39093 backend=nccl +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=669) DEBUG 04-23 00:11:52 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=668) DEBUG 04-23 00:11:52 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=670) DEBUG 04-23 00:11:52 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=671) DEBUG 04-23 00:11:52 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +(Worker pid=668) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
+(Worker pid=668) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=669) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=669) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=668) DEBUG 04-23 00:11:52 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=668) INFO 04-23 00:11:52 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=670) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=670) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=671) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=671) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=670) DEBUG 04-23 00:11:54 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=671) DEBUG 04-23 00:11:54 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=669) DEBUG 04-23 00:11:54 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=668) DEBUG 04-23 00:11:54 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=668) DEBUG 04-23 00:11:54 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/03183ada-28c6-4fcd-ad5b-2f4670d315d5 +(Worker pid=668) DEBUG 04-23 00:11:54 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3], buffer_handle=(3, 4194304, 6, 'psm_a0da278d'), local_subscribe_addr='ipc:///tmp/03183ada-28c6-4fcd-ad5b-2f4670d315d5', local_notify_addr='ipc:///tmp/7066347a-f7cf-46f0-98ac-1b6df273b035', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=670) DEBUG 04-23 00:11:54 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/03183ada-28c6-4fcd-ad5b-2f4670d315d5 +(Worker pid=669) DEBUG 04-23 00:11:54 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/03183ada-28c6-4fcd-ad5b-2f4670d315d5 +(Worker pid=671) DEBUG 04-23 00:11:54 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/03183ada-28c6-4fcd-ad5b-2f4670d315d5 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 +[Gloo] Rank 3 is connected to 3 peer ranks. 
Expected number of connected peer ranks is : 3 +(Worker pid=668) INFO 04-23 00:11:54 [distributed/parallel_state.py:1716] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A +(Worker pid=669) DEBUG 04-23 00:11:55 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.12GiB, total_memory=79.19GiB, cuda_memory=2.07GiB, torch_memory=0.02GiB, non_torch_memory=2.05GiB, timestamp=1776903115.0043905, auto_measure=True +(Worker pid=669) DEBUG 04-23 00:11:55 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=670) DEBUG 04-23 00:11:55 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.12GiB, total_memory=79.19GiB, cuda_memory=2.07GiB, torch_memory=0.02GiB, non_torch_memory=2.05GiB, timestamp=1776903115.0138264, auto_measure=True +(Worker pid=670) DEBUG 04-23 00:11:55 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=668) DEBUG 04-23 00:11:55 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.12GiB, total_memory=79.19GiB, cuda_memory=2.07GiB, torch_memory=0.02GiB, non_torch_memory=2.05GiB, timestamp=1776903115.0193129, auto_measure=True +(Worker pid=668) DEBUG 04-23 00:11:55 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=671) DEBUG 04-23 00:11:55 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.12GiB, total_memory=79.19GiB, cuda_memory=2.07GiB, torch_memory=0.02GiB, non_torch_memory=2.05GiB, timestamp=1776903115.032611, auto_measure=True +(Worker pid=671) DEBUG 04-23 00:11:55 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=669) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=669) DEBUG 04-23 00:11:55 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=670) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=670) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=668) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=668) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=671) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=671) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=669) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=670) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=668) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=668) DEBUG 04-23 00:11:55 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. 
Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=669) DEBUG 04-23 00:11:55 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=670) DEBUG 04-23 00:11:55 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=671) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=668) DEBUG 04-23 00:11:55 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=671) DEBUG 04-23 00:11:55 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(APIServer pid=1) DEBUG 04-23 00:12:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker pid=668) DEBUG 04-23 00:12:01 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_TP0 pid=668) INFO 04-23 00:12:01 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-4-Scout-17B-16E-Instruct... +(Worker_TP0 pid=668) INFO 04-23 00:12:01 [platforms/cuda.py:390] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention +(Worker_TP0 pid=668) INFO 04-23 00:12:01 [model_executor/.../attention/mm_encoder_attention.py:230] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention. +(Worker_TP0 pid=668) INFO 04-23 00:12:01 [config/vllm.py:790] Asynchronous scheduling is enabled. 
+(Worker_TP0 pid=668) INFO 04-23 00:12:01 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP0 pid=668) DEBUG 04-23 00:12:01 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_TP0 pid=668) INFO 04-23 00:12:01 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(Worker_TP0 pid=668) INFO 04-23 00:12:01 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP0 pid=668) INFO 04-23 00:12:01 [model_executor/.../oracle/unquantized.py:186] Using TRITON backend for Unquantized MoE +(Worker_TP0 pid=668) DEBUG 04-23 00:12:01 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts +(Worker_TP0 pid=668) DEBUG 04-23 00:12:02 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=668) DEBUG 04-23 00:12:02 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=668) DEBUG 04-23 00:12:02 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) +(Worker_TP0 pid=668) DEBUG 04-23 00:12:02 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=668) DEBUG 04-23 00:12:02 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) +(Worker_TP0 pid=668) DEBUG 04-23 00:12:02 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 
'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=668) DEBUG 04-23 00:12:02 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP0 pid=668) DEBUG 04-23 00:12:02 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00047-of-00050.safetensors', 'model-00006-of-00050.safetensors', 'model-00002-of-00050.safetensors', 'model-00050-of-00050.safetensors', 'model-00001-of-00050.safetensors', 'model-00013-of-00050.safetensors', 'model-00036-of-00050.safetensors', 'model-00039-of-00050.safetensors', 'model-00015-of-00050.safetensors', 'model-00048-of-00050.safetensors', 'model-00020-of-00050.safetensors', 'model-00042-of-00050.safetensors', 'model-00004-of-00050.safetensors', 'model-00027-of-00050.safetensors', 'model-00041-of-00050.safetensors', 'model-00011-of-00050.safetensors', 'model-00032-of-00050.safetensors', 'model-00016-of-00050.safetensors', 'model-00018-of-00050.safetensors', 'model-00049-of-00050.safetensors', 'model-00028-of-00050.safetensors', 'model-00031-of-00050.safetensors', 'model-00033-of-00050.safetensors', 'model-00035-of-00050.safetensors', 'model-00017-of-00050.safetensors', 'model-00037-of-00050.safetensors', 'model-00045-of-00050.safetensors', 'model-00021-of-00050.safetensors', 'model-00026-of-00050.safetensors', 'model-00034-of-00050.safetensors', 'model-00008-of-00050.safetensors', 'model-00019-of-00050.safetensors', 'model-00029-of-00050.safetensors', 'model-00014-of-00050.safetensors', 'model-00040-of-00050.safetensors', 'model-00030-of-00050.safetensors', 'model-00024-of-00050.safetensors', 'model-00007-of-00050.safetensors', 'model-00025-of-00050.safetensors', 'model-00038-of-00050.safetensors', 'model-00043-of-00050.safetensors', 'model-00046-of-00050.safetensors', 'model-00005-of-00050.safetensors', 'model-00022-of-00050.safetensors', 
'model-00044-of-00050.safetensors', 'model-00009-of-00050.safetensors', 'model-00012-of-00050.safetensors', 'model-00010-of-00050.safetensors', 'model-00023-of-00050.safetensors', 'model-00003-of-00050.safetensors']] +(Worker_TP2 pid=670) DEBUG 04-23 00:12:02 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP2 pid=670) DEBUG 04-23 00:12:02 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP2 pid=670) DEBUG 04-23 00:12:02 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) +(Worker_TP2 pid=670) DEBUG 04-23 00:12:02 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP2 pid=670) DEBUG 04-23 00:12:02 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) +(Worker_TP2 pid=670) DEBUG 04-23 00:12:02 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP2 pid=670) DEBUG 04-23 00:12:02 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP3 pid=671) DEBUG 04-23 00:12:02 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP3 pid=671) DEBUG 04-23 00:12:02 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP1 pid=669) DEBUG 04-23 00:12:02 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP1 pid=669) DEBUG 04-23 00:12:02 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP3 pid=671) DEBUG 04-23 00:12:02 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) +(Worker_TP3 pid=671) DEBUG 04-23 00:12:02 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=669) DEBUG 04-23 00:12:02 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) +(Worker_TP1 pid=669) DEBUG 04-23 00:12:02 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP3 pid=671) DEBUG 04-23 00:12:02 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) +(Worker_TP3 pid=671) DEBUG 04-23 00:12:02 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP3 pid=671) DEBUG 04-23 00:12:02 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP1 pid=669) DEBUG 04-23 00:12:02 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) +(Worker_TP1 pid=669) DEBUG 04-23 00:12:02 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=669) DEBUG 04-23 00:12:02 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP2 pid=670) DEBUG 04-23 00:12:02 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00045-of-00050.safetensors', 'model-00034-of-00050.safetensors', 'model-00043-of-00050.safetensors', 'model-00046-of-00050.safetensors', 'model-00008-of-00050.safetensors', 'model-00032-of-00050.safetensors', 'model-00040-of-00050.safetensors', 'model-00042-of-00050.safetensors', 'model-00022-of-00050.safetensors', 'model-00035-of-00050.safetensors', 'model-00037-of-00050.safetensors', 'model-00050-of-00050.safetensors', 'model-00001-of-00050.safetensors', 'model-00033-of-00050.safetensors', 'model-00007-of-00050.safetensors', 'model-00017-of-00050.safetensors', 'model-00024-of-00050.safetensors', 'model-00027-of-00050.safetensors', 'model-00039-of-00050.safetensors', 'model-00015-of-00050.safetensors', 'model-00002-of-00050.safetensors', 'model-00004-of-00050.safetensors', 'model-00025-of-00050.safetensors', 'model-00048-of-00050.safetensors', 'model-00023-of-00050.safetensors', 'model-00010-of-00050.safetensors', 'model-00028-of-00050.safetensors', 'model-00049-of-00050.safetensors', 'model-00018-of-00050.safetensors', 'model-00012-of-00050.safetensors', 'model-00029-of-00050.safetensors', 'model-00016-of-00050.safetensors', 'model-00044-of-00050.safetensors', 'model-00006-of-00050.safetensors', 'model-00038-of-00050.safetensors', 'model-00041-of-00050.safetensors', 
'model-00005-of-00050.safetensors', 'model-00036-of-00050.safetensors', 'model-00026-of-00050.safetensors', 'model-00030-of-00050.safetensors', 'model-00031-of-00050.safetensors', 'model-00003-of-00050.safetensors', 'model-00021-of-00050.safetensors', 'model-00014-of-00050.safetensors', 'model-00019-of-00050.safetensors', 'model-00011-of-00050.safetensors', 'model-00020-of-00050.safetensors', 'model-00009-of-00050.safetensors', 'model-00013-of-00050.safetensors', 'model-00047-of-00050.safetensors']] +(Worker_TP3 pid=671) DEBUG 04-23 00:12:02 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00023-of-00050.safetensors', 'model-00006-of-00050.safetensors', 'model-00010-of-00050.safetensors', 'model-00021-of-00050.safetensors', 'model-00015-of-00050.safetensors', 'model-00001-of-00050.safetensors', 'model-00046-of-00050.safetensors', 'model-00017-of-00050.safetensors', 'model-00003-of-00050.safetensors', 'model-00034-of-00050.safetensors', 'model-00005-of-00050.safetensors', 'model-00048-of-00050.safetensors', 'model-00004-of-00050.safetensors', 'model-00008-of-00050.safetensors', 'model-00014-of-00050.safetensors', 'model-00043-of-00050.safetensors', 'model-00028-of-00050.safetensors', 'model-00012-of-00050.safetensors', 'model-00018-of-00050.safetensors', 'model-00031-of-00050.safetensors', 'model-00042-of-00050.safetensors', 'model-00047-of-00050.safetensors', 'model-00024-of-00050.safetensors', 'model-00030-of-00050.safetensors', 'model-00041-of-00050.safetensors', 'model-00035-of-00050.safetensors', 'model-00026-of-00050.safetensors', 'model-00009-of-00050.safetensors', 'model-00045-of-00050.safetensors', 'model-00007-of-00050.safetensors', 'model-00027-of-00050.safetensors', 'model-00016-of-00050.safetensors', 'model-00029-of-00050.safetensors', 'model-00033-of-00050.safetensors', 'model-00049-of-00050.safetensors', 'model-00020-of-00050.safetensors', 'model-00036-of-00050.safetensors', 'model-00019-of-00050.safetensors', 
'model-00002-of-00050.safetensors', 'model-00039-of-00050.safetensors', 'model-00050-of-00050.safetensors', 'model-00011-of-00050.safetensors', 'model-00025-of-00050.safetensors', 'model-00022-of-00050.safetensors', 'model-00038-of-00050.safetensors', 'model-00013-of-00050.safetensors', 'model-00040-of-00050.safetensors', 'model-00032-of-00050.safetensors', 'model-00044-of-00050.safetensors', 'model-00037-of-00050.safetensors']] +(Worker_TP1 pid=669) DEBUG 04-23 00:12:02 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00050.safetensors', 'model-00021-of-00050.safetensors', 'model-00046-of-00050.safetensors', 'model-00012-of-00050.safetensors', 'model-00004-of-00050.safetensors', 'model-00018-of-00050.safetensors', 'model-00030-of-00050.safetensors', 'model-00037-of-00050.safetensors', 'model-00017-of-00050.safetensors', 'model-00029-of-00050.safetensors', 'model-00032-of-00050.safetensors', 'model-00040-of-00050.safetensors', 'model-00007-of-00050.safetensors', 'model-00002-of-00050.safetensors', 'model-00015-of-00050.safetensors', 'model-00028-of-00050.safetensors', 'model-00026-of-00050.safetensors', 'model-00043-of-00050.safetensors', 'model-00013-of-00050.safetensors', 'model-00006-of-00050.safetensors', 'model-00024-of-00050.safetensors', 'model-00011-of-00050.safetensors', 'model-00020-of-00050.safetensors', 'model-00008-of-00050.safetensors', 'model-00031-of-00050.safetensors', 'model-00010-of-00050.safetensors', 'model-00049-of-00050.safetensors', 'model-00009-of-00050.safetensors', 'model-00022-of-00050.safetensors', 'model-00034-of-00050.safetensors', 'model-00048-of-00050.safetensors', 'model-00019-of-00050.safetensors', 'model-00025-of-00050.safetensors', 'model-00036-of-00050.safetensors', 'model-00041-of-00050.safetensors', 'model-00035-of-00050.safetensors', 'model-00005-of-00050.safetensors', 'model-00039-of-00050.safetensors', 'model-00045-of-00050.safetensors', 'model-00038-of-00050.safetensors', 
'model-00047-of-00050.safetensors', 'model-00023-of-00050.safetensors', 'model-00016-of-00050.safetensors', 'model-00042-of-00050.safetensors', 'model-00044-of-00050.safetensors', 'model-00033-of-00050.safetensors', 'model-00014-of-00050.safetensors', 'model-00050-of-00050.safetensors', 'model-00003-of-00050.safetensors', 'model-00027-of-00050.safetensors']] +(APIServer pid=1) DEBUG 04-23 00:12:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:12:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:12:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:12:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:12:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:13:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:13:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:13:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:13:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:13:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:13:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:14:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:14:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(APIServer pid=1) DEBUG 04-23 00:14:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:14:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:14:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:14:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:15:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:15:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:15:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:15:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:15:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:15:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:16:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:16:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:16:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:16:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:16:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(APIServer pid=1) DEBUG 04-23 00:16:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:17:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:17:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:17:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:17:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:17:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:17:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:18:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:18:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:18:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:18:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:18:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:18:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:19:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:19:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(APIServer pid=1) DEBUG 04-23 00:19:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:19:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:19:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:19:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:20:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:20:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 00:20:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=668) INFO 04-23 00:20:22 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for meta-llama/Llama-4-Scout-17B-16E-Instruct: 500.340780 seconds +(Worker_TP0 pid=668) Loading safetensors checkpoint shards: 0% Completed | 0/50 [00:00 +(Worker_TP1 pid=669) DEBUG 04-23 00:27:37 [compilation/decorators.py:528] Start compiling function +(Worker_TP0 pid=668) DEBUG 04-23 00:27:37 [compilation/decorators.py:528] Start compiling function +(Worker_TP2 pid=670) DEBUG 04-23 00:27:37 [compilation/decorators.py:528] Start compiling function +(APIServer pid=1) DEBUG 04-23 00:27:41 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama4.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP3 pid=671) DEBUG 04-23 
00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +(Worker_TP3 pid=671) 
DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama4.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=3084af9694 comp=e546579c48 code=d8a8fd34540a53acd8b823bdc994eaf99e6b69c42a6dc1372178a61fa194a68c 
dir=/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_1_0/backbone +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=3084af9694 comp=e546579c48 code=d8a8fd34540a53acd8b823bdc994eaf99e6b69c42a6dc1372178a61fa194a68c dir=/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_3_0/backbone +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP1 pid=669) DEBUG 
04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP1 pid=669) DEBUG 04-23 
00:27:44 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP1 
pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP1 pid=669) 
DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': 
False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] Vllm 
config hash: 3084af9694 +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, 
+(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, 
+(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP3 pid=671) DEBUG 04-23 
00:27:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP3 pid=671) 
DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, 
+(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': 
False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] Vllm config hash: 3084af9694 +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama4.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP0 pid=668) INFO 04-23 00:27:44 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone for vLLM's torch.compile +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=3084af9694 comp=e546579c48 code=d8a8fd34540a53acd8b823bdc994eaf99e6b69c42a6dc1372178a61fa194a68c 
dir=/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP0 
pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP0 pid=668) DEBUG 
04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] Vllm config hash: 3084af9694 +(Worker_TP0 pid=668) INFO 04-23 00:27:44 [compilation/backends.py:1111] Dynamo bytecode transform time: 6.65 s +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 2 MB,Maximal number of tokens used by Flashinfer Allreduce 
Fusion: 204 +(Worker_TP0 pid=668) INFO 04-23 00:27:44 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm +(Worker_TP0 pid=668) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. +(Worker_TP0 pid=668) return func(*args, **kwargs) +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama4.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=3084af9694 comp=e546579c48 code=d8a8fd34540a53acd8b823bdc994eaf99e6b69c42a6dc1372178a61fa194a68c dir=/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_2_0/backbone +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP2 pid=670) DEBUG 04-23 
00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP2 pid=670) 
DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP2 pid=670) DEBUG 
04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': 
True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 
'production-docker-image', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP2 
pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] Vllm config hash: 3084af9694 +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=0, max_token_num=204, hidden_dim=5120, dtype=torch.bfloat16 +(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=1, max_token_num=204, hidden_dim=5120, dtype=torch.bfloat16 +(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=2, max_token_num=204, hidden_dim=5120, dtype=torch.bfloat16 +(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=3, max_token_num=204, hidden_dim=5120, dtype=torch.bfloat16 +(Worker_TP0 pid=668) INFO 04-23 00:27:44 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 204), (205, 8192)] +(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(Worker_TP3 pid=671) DEBUG 04-23 00:27:45 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices +(Worker_TP3 pid=671) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP3 
pid=671) DEBUG 04-23 00:27:45 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 0 patterns +(Worker_TP3 pid=671) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 1.8 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:27:45 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=671) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:45 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices +(Worker_TP0 pid=668) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:45 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 0 patterns +(Worker_TP0 pid=668) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 1.7 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:45 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=668) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:45 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices +(Worker_TP2 pid=670) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:45 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 0 patterns +(Worker_TP2 pid=670) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] 
AllReduceFusionPass completed in 1.9 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:45 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=670) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:27:46 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices +(Worker_TP1 pid=669) DEBUG 04-23 00:27:46 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:27:46 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 0 patterns +(Worker_TP1 pid=669) DEBUG 04-23 00:27:46 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 2.0 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:27:46 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:27:46 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=669) DEBUG 04-23 00:27:46 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(APIServer pid=1) DEBUG 04-23 00:27:51 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP0 pid=668) INFO 04-23 00:27:53 [compilation/backends.py:372] Cache the graph of compile range (1, 204) for later use +(Worker_TP0 pid=668) DEBUG 04-23 00:27:53 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 204) from inductor_standalone via handle ('artifact_compile_range_1_204_subgraph_0', '/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_1_204_subgraph_0') +(Worker_TP1 pid=669) DEBUG 04-23 00:27:53 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices +(Worker_TP1 pid=669) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:27:53 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) +(Worker_TP1 pid=669) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:27:53 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=669) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:53 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices +(Worker_TP2 pid=670) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:53 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) +(Worker_TP2 pid=670) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:53 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=670) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 
ms +(Worker_TP3 pid=671) DEBUG 04-23 00:27:53 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices +(Worker_TP3 pid=671) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:27:53 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) +(Worker_TP3 pid=671) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:27:53 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=671) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:53 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices +(Worker_TP0 pid=668) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:53 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) +(Worker_TP0 pid=668) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:53 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=668) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP0 pid=668) INFO 04-23 00:27:54 [compilation/backends.py:372] Cache the graph of compile range (205, 8192) for later use +(Worker_TP0 pid=668) DEBUG 04-23 00:27:54 [compilation/backends.py:377] Store the 0-th graph for compile range(205, 8192) from inductor_standalone via handle ('artifact_compile_range_205_8192_subgraph_0', 
'/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_205_8192_subgraph_0') +(Worker_TP1 pid=669) DEBUG 04-23 00:27:54 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices +(Worker_TP1 pid=669) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:54 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices +(Worker_TP2 pid=670) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:27:54 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=669) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.2 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:27:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=669) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:54 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP2 pid=670) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.6 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP2 pid=670) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:27:54 
[compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices +(Worker_TP3 pid=671) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:54 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices +(Worker_TP0 pid=668) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:27:54 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP3 pid=671) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 42.2 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:27:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP3 pid=671) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:54 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=668) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.6 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=668) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:55 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 204) from inductor_standalone via handle ('artifact_compile_range_1_204_subgraph_1', 
'/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_1_204_subgraph_1') +(Worker_TP1 pid=669) DEBUG 04-23 00:27:55 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices +(Worker_TP1 pid=669) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:27:55 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) +(Worker_TP1 pid=669) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:27:55 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=669) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:55 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices +(Worker_TP2 pid=670) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:55 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) +(Worker_TP2 pid=670) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:55 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=670) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:27:55 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices +(Worker_TP3 pid=671) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:27:55 
[compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) +(Worker_TP3 pid=671) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:27:55 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=671) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:55 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices +(Worker_TP0 pid=668) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:55 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) +(Worker_TP0 pid=668) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:55 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=668) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:58 [compilation/backends.py:377] Store the 1-th graph for compile range(205, 8192) from inductor_standalone via handle ('artifact_compile_range_205_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_205_8192_subgraph_1') +(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 204) from inductor_standalone via handle ('artifact_compile_range_1_204_subgraph_2', '/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_1_204_subgraph_2') +(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/backends.py:377] Store the 2-th graph for compile 
range(205, 8192) from inductor_standalone via handle ('artifact_compile_range_205_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_205_8192_subgraph_2') +(Worker_TP1 pid=669) DEBUG 04-23 00:27:59 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=669) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:59 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=670) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:27:59 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=669) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.8 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:27:59 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=669) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:59 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP2 pid=670) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.3 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:27:59 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP2 pid=670) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] 
FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:27:59 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=671) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.7 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 40.7 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:28:00 [compilation/backends.py:377] Store the 3-th graph for compile range(1, 204) from inductor_standalone 
via handle ('artifact_compile_range_1_204_subgraph_3', '/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_1_204_subgraph_3') +(Worker_TP1 pid=669) DEBUG 04-23 00:28:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=669) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:28:00 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) +(Worker_TP1 pid=669) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:28:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=669) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:28:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=670) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:28:00 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) +(Worker_TP2 pid=670) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:28:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=670) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:28:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=668) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(Worker_TP0 
pid=668) DEBUG 04-23 00:28:00 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) +(Worker_TP0 pid=668) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:28:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=668) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.6 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) +(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:28:00 [compilation/backends.py:377] Store the 3-th graph for compile range(205, 8192) from inductor_standalone via handle ('artifact_compile_range_205_8192_subgraph_3', '/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_205_8192_subgraph_3') +(APIServer pid=1) DEBUG 04-23 00:28:01 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP2 pid=670) DEBUG 04-23 00:28:03 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=670) DEBUG 04-23 00:28:03 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 37.1 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.3 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 
[compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 37.8 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/backends.py:377] Store the 48-th graph for compile range(1, 204) from inductor_standalone via handle ('artifact_compile_range_1_204_subgraph_48', '/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_1_204_subgraph_48') +(Worker_TP0 pid=668) INFO 04-23 00:28:04 [compilation/backends.py:390] Compiling a graph for compile range (1, 204) takes 11.13 s +(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) +(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 
[compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.4 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) +(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 
[compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) +(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) +(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(Worker_TP0 pid=668) DEBUG 04-23 00:28:05 [compilation/backends.py:377] Store the 48-th graph for compile range(205, 8192) from inductor_standalone via handle ('artifact_compile_range_205_8192_subgraph_48', '/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_205_8192_subgraph_48') +(Worker_TP0 pid=668) INFO 04-23 00:28:05 [compilation/backends.py:390] Compiling a graph for compile range (205, 8192) takes 12.01 s +(Worker_TP0 pid=668) DEBUG 04-23 00:28:05 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/computation_graph.py +(Worker_TP0 pid=668) INFO 04-23 00:28:07 [compilation/decorators.py:640] saved AOT 
compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/3dd8de28feb215835436b71e8332dffd08ece5a27a87859d202824d9bdf14468/rank_0_0/model +(Worker_TP0 pid=668) INFO 04-23 00:28:07 [compilation/monitor.py:48] torch.compile took 29.69 s in total +(Worker_TP0 pid=668) WARNING 04-23 00:28:08 [model_executor/.../fused_moe/fused_moe.py:1090] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +(Worker_TP0 pid=668) INFO 04-23 00:28:09 [compilation/monitor.py:76] Initial profiling/warmup run took 2.14 s +(APIServer pid=1) DEBUG 04-23 00:28:11 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=668) INFO 04-23 00:28:15 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP0 pid=668) WARNING 04-23 00:28:15 [v1/worker/gpu_model_runner.py:6339] CUDAGraphMode.FULL_AND_PIECEWISE is not supported with ChunkedLocalAttention_8192_FlashAttentionBackend backend (support: AttentionCGSupport.NEVER); setting cudagraph_mode=PIECEWISE because attention is compiled piecewise +(Worker_TP0 pid=668) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP0 pid=668) INFO 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512) +(Worker_TP3 pid=671) INFO 04-23 00:28:15 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP3 pid=671) WARNING 04-23 00:28:15 [v1/worker/gpu_model_runner.py:6339] CUDAGraphMode.FULL_AND_PIECEWISE is not supported with ChunkedLocalAttention_8192_FlashAttentionBackend backend (support: AttentionCGSupport.NEVER); setting cudagraph_mode=PIECEWISE because attention is compiled piecewise +(Worker_TP1 pid=669) INFO 04-23 
00:28:15 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP1 pid=669) WARNING 04-23 00:28:15 [v1/worker/gpu_model_runner.py:6339] CUDAGraphMode.FULL_AND_PIECEWISE is not supported with ChunkedLocalAttention_8192_FlashAttentionBackend backend (support: AttentionCGSupport.NEVER); setting cudagraph_mode=PIECEWISE because attention is compiled piecewise +(Worker_TP3 pid=671) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP3 pid=671) INFO 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512) +(Worker_TP1 pid=669) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP1 pid=669) INFO 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512) +(Worker_TP2 pid=670) INFO 04-23 00:28:15 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP2 pid=670) WARNING 04-23 00:28:15 [v1/worker/gpu_model_runner.py:6339] CUDAGraphMode.FULL_AND_PIECEWISE is not supported with ChunkedLocalAttention_8192_FlashAttentionBackend backend (support: AttentionCGSupport.NEVER); setting cudagraph_mode=PIECEWISE because attention is compiled piecewise +(Worker_TP2 pid=670) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP2 pid=670) INFO 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512) +(Worker_TP0 pid=668) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=671) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=670) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=668) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=668) DEBUG 04-23 00:28:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=668) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-23 00:28:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=668) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=668) DEBUG 04-23 00:28:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=671) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=671) DEBUG 04-23 00:28:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=669) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=668) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 20.00 MiB per-graph +(Worker_TP0 pid=668) INFO 04-23 
00:28:16 [distributed/device_communicators/custom_all_reduce.py:216] Registering 192 cuda graph addresses +(Worker_TP3 pid=671) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-23 00:28:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP3 pid=671) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=671) DEBUG 04-23 00:28:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=670) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=670) DEBUG 04-23 00:28:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=669) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 20.00 MiB per-graph +(Worker_TP1 pid=669) INFO 04-23 00:28:16 [distributed/device_communicators/custom_all_reduce.py:216] Registering 192 cuda graph addresses +(Worker_TP3 pid=671) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 20.00 MiB per-graph +(Worker_TP3 pid=671) INFO 04-23 00:28:16 [distributed/device_communicators/custom_all_reduce.py:216] Registering 192 cuda graph addresses +(Worker_TP2 pid=670) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: 
None +(Worker_TP2 pid=670) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP2 pid=670) DEBUG 04-23 00:28:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP2 pid=670) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 20.00 MiB per-graph +(Worker_TP2 pid=670) INFO 04-23 00:28:16 [distributed/device_communicators/custom_all_reduce.py:216] Registering 192 cuda graph addresses +(Worker_TP0 pid=668) DEBUG 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP0 pid=668) INFO 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.09 GiB total +(Worker_TP3 pid=671) DEBUG 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP3 pid=671) INFO 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.09 GiB total +(Worker_TP1 pid=669) DEBUG 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP1 pid=669) INFO 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.09 GiB total +(Worker_TP2 pid=670) DEBUG 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP2 pid=670) INFO 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.09 GiB total +(Worker_TP3 pid=671) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:424] Initial free memory: 77.12 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP3 pid=671) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:430] Free memory after profiling: 21.22 GiB (total), 19.33 GiB (within requested) +(Worker_TP3 pid=671) DEBUG 04-23 00:28:17 
[v1/worker/gpu_worker.py:435] Memory profiling takes 40.78 seconds. Total non KV cache memory: 58.48GiB; torch peak memory increase: 3.19GiB; non-torch forward increase memory: 2.17GiB; weights memory: 53.12GiB. +(Worker_TP3 pid=671) INFO 04-23 00:28:17 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9637 to maintain the same effective KV cache size. +(Worker_TP3 pid=671) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=668) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:424] Initial free memory: 77.12 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP0 pid=668) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:430] Free memory after profiling: 21.22 GiB (total), 19.33 GiB (within requested) +(Worker_TP0 pid=668) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:435] Memory profiling takes 40.70 seconds. Total non KV cache memory: 58.48GiB; torch peak memory increase: 3.19GiB; non-torch forward increase memory: 2.17GiB; weights memory: 53.12GiB. +(Worker_TP0 pid=668) INFO 04-23 00:28:17 [v1/worker/gpu_worker.py:436] Available KV cache memory: 16.75 GiB +(Worker_TP0 pid=668) INFO 04-23 00:28:17 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9637 to maintain the same effective KV cache size. 
+(Worker_TP0 pid=668) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=469) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=469) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=669) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:424] Initial free memory: 77.12 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP1 pid=669) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:430] Free memory after profiling: 21.22 GiB (total), 19.33 GiB (within requested) +(Worker_TP1 pid=669) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:435] Memory profiling takes 40.73 seconds. Total non KV cache memory: 58.48GiB; torch peak memory increase: 3.19GiB; non-torch forward increase memory: 2.17GiB; weights memory: 53.12GiB. +(Worker_TP1 pid=669) INFO 04-23 00:28:17 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9637 to maintain the same effective KV cache size. 
+(Worker_TP1 pid=669) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=469) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=469) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=670) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:424] Initial free memory: 77.12 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP2 pid=670) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:430] Free memory after profiling: 21.22 GiB (total), 19.33 GiB (within requested) +(Worker_TP2 pid=670) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:435] Memory profiling takes 40.85 seconds. Total non KV cache memory: 58.48GiB; torch peak memory increase: 3.19GiB; non-torch forward increase memory: 2.17GiB; weights memory: 53.12GiB. +(Worker_TP2 pid=670) INFO 04-23 00:28:17 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9637 to maintain the same effective KV cache size. 
+(Worker_TP2 pid=670) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=469) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=469) INFO 04-23 00:28:17 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 365,968 tokens +(EngineCore pid=469) INFO 04-23 00:28:17 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 44.68x +(Worker_TP0 pid=668) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=669) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=671) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=670) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=670) 2026-04-23 00:28:17,637 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP1 pid=669) 2026-04-23 00:28:17,637 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP3 pid=671) 2026-04-23 00:28:17,637 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP0 pid=668) 2026-04-23 00:28:17,637 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(Worker_TP2 pid=670) DEBUG 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP3 pid=671) DEBUG 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=668) DEBUG 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=668) 2026-04-23 00:28:17,696 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP1 pid=669) 2026-04-23 00:28:17,696 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP3 pid=671) 2026-04-23 00:28:17,696 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP2 pid=670) 2026-04-23 00:28:17,696 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP0 pid=668) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00: ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=469) DEBUG 04-23 00:28:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=469) DEBUG 04-23 00:28:39 [v1/engine/core.py:190] Batch queue is enabled with size 2 +(EngineCore pid=469) DEBUG 04-23 00:28:40 [utils/gc_utils.py:40] GC Debug Config. enabled:False,top_objects:-1 +(EngineCore pid=469) INFO 04-23 00:28:40 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-23 00:28:40 [v1/engine/utils.py:1158] READY from local core engine process 0. 
+(APIServer pid=1) DEBUG 04-23 00:28:40 [v1/metrics/loggers.py:273] Engine 000: vllm cache_config_info with initialization after num_gpu_blocks is: 91495 +(EngineCore pid=469) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=469) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=469) INFO 04-23 00:28:40 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP0 pid=668) DEBUG 04-23 00:28:40 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=669) DEBUG 04-23 00:28:40 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=670) DEBUG 04-23 00:28:40 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=671) DEBUG 04-23 00:28:40 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=469) DEBUG 04-23 00:28:40 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=469) DEBUG 04-23 00:28:40 [v1/engine/core.py:1158] EngineCore waiting for work. +(EngineCore pid=469) DEBUG 04-23 00:28:40 [v1/engine/core.py:1158] EngineCore waiting for work. +(APIServer pid=1) INFO 04-23 00:28:40 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] +(APIServer pid=1) WARNING 04-23 00:28:40 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 0.6, 'top_p': 0.9}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. +(APIServer pid=1) DEBUG 04-23 00:28:40 [renderers/base.py:197] Warming up chat template processing... 
+(Worker_TP1 pid=669) DEBUG 04-23 00:28:41 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP2 pid=670) DEBUG 04-23 00:28:41 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=668) DEBUG 04-23 00:28:41 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP3 pid=671) DEBUG 04-23 00:28:41 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) INFO 04-23 00:28:42 [renderers/hf.py:314] Detected the chat template content format to be 'openai'. You can set `--chat-template-content-format` to override this. +(APIServer pid=1) DEBUG 04-23 00:28:42 [renderers/base.py:203] Chat template warmup completed in 2.133s +(APIServer pid=1) DEBUG 04-23 00:28:42 [renderers/base.py:218] Warming up multi-modal processing... +(APIServer pid=1) INFO 04-23 00:28:47 [renderers/base.py:231] Multi-modal warmup completed in 4.752s +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:37] Available routes are: +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /load, Methods: GET +(APIServer pid=1) INFO 04-23 00:28:47 
[entrypoints/launcher.py:46] Route: /version, Methods: GET +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /health, Methods: GET +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST 
+(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST +(APIServer pid=1) INFO: Started server process [1] +(APIServer pid=1) INFO: Waiting for application startup. +(APIServer pid=1) INFO: Application startup complete. +(APIServer pid=1) DEBUG 04-23 00:28:49 [v1/engine/async_llm.py:875] Called check_health. +(APIServer pid=1) INFO: 10.128.10.2:42458 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/microsoft-phi-2--h100-80gb--tp1pp1dp1--2048.log b/accuracy/results/v0.19.0/logs/microsoft-phi-2--h100-80gb--tp1pp1dp1--2048.log new file mode 100644 index 00000000..32d7d147 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/microsoft-phi-2--h100-80gb--tp1pp1dp1--2048.log @@ -0,0 +1,769 @@ +DEBUG 04-22 20:00:23 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 20:00:23 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 20:00:23 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 20:00:23 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 20:00:23 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 20:00:23 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 20:00:23 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 20:00:23 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 20:00:23 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 20:00:23 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 20:00:23 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 20:00:23 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-22 20:00:28 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 20:00:30 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 20:00:30 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 20:00:30 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 20:00:30 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 20:00:30 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) INFO 04-22 20:00:30 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 20:00:30 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 20:00:30 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 20:00:30 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model microsoft/phi-2 +(APIServer pid=1) INFO 04-22 20:00:30 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 20:00:30 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 20:00:30 [entrypoints/utils.py:233] non-default args: {'model_tag': 'microsoft/phi-2', 'model': 'microsoft/phi-2', 'max_model_len': 2048, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 20:00:30 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 20:00:30 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.phi.PhiForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 20:00:30 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0016507 secs +(APIServer pid=1) INFO 04-22 20:00:30 
[config/model.py:549] Resolved architecture: PhiForCausalLM +(APIServer pid=1) INFO 04-22 20:00:30 [config/model.py:1678] Using max model len 2048 +(APIServer pid=1) DEBUG 04-22 20:00:30 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 20:00:30 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 20:00:30 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 20:00:30 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 20:00:30 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 20:00:30 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 20:00:30 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 20:00:30 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 20:00:30 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 20:00:31 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 20:00:31 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 20:00:35 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 20:00:35 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 20:00:35 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 20:00:35 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 20:00:35 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 20:00:35 [platforms/__init__.py:113] Checking if ROCm platform is available. 
+DEBUG 04-22 20:00:35 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 20:00:35 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 20:00:35 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 20:00:35 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 20:00:35 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 20:00:35 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 20:00:40 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=245) DEBUG 04-22 20:00:41 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 20:00:41 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=245) DEBUG 04-22 20:00:41 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/eca9f98d-548f-4f72-b376-b1acbadfa08d'], outputs=['ipc:///tmp/c0a62889-c208-4c99-bf84-864ec7c7abaf'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=245) DEBUG 04-22 20:00:41 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=245) DEBUG 04-22 20:00:41 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=245) DEBUG 04-22 20:00:41 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=245) DEBUG 04-22 20:00:41 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=245) DEBUG 04-22 20:00:41 [plugins/__init__.py:49] All plugins in this group will be 
loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(EngineCore pid=245) INFO 04-22 20:00:41 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='microsoft/phi-2', speculative_config=None, tokenizer='microsoft/phi-2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=microsoft/phi-2, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=245) WARNING 04-22 20:00:42 [platforms/interface.py:525] Using 'pin_memory=False' as WSL is detected. This may slow down the performance. +(EngineCore pid=245) DEBUG 04-22 20:00:42 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.130.3.163:54515 backend=nccl +(EngineCore pid=245) INFO 04-22 20:00:42 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.130.3.163:54515 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(EngineCore pid=245) DEBUG 04-22 20:00:42 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=245) INFO 04-22 20:00:42 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=245) DEBUG 04-22 20:00:42 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776888042.7716968, auto_measure=True +(EngineCore pid=245) DEBUG 04-22 20:00:42 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=245) DEBUG 04-22 20:00:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=245) DEBUG 04-22 20:00:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=245) DEBUG 04-22 20:00:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=245) DEBUG 04-22 20:00:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=245) 
DEBUG 04-22 20:00:42 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(EngineCore pid=245) DEBUG 04-22 20:00:42 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=245) DEBUG 04-22 20:00:42 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=245) DEBUG 04-22 20:00:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=245) INFO 04-22 20:00:42 [v1/worker/gpu_model_runner.py:4735] Starting to load model microsoft/phi-2... +(EngineCore pid=245) DEBUG 04-22 20:00:43 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=80, dtype=torch.float16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASHINFER: [head_size not supported]}. +(EngineCore pid=245) INFO 04-22 20:00:43 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=245) INFO 04-22 20:00:43 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=245) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=245) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=245) DEBUG 04-22 20:00:43 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=245) DEBUG 04-22 20:00:43 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=245) DEBUG 04-22 20:00:43 [config/compilation.py:1195] disabled custom ops: Counter({'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'gelu_new': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=245) DEBUG 04-22 20:00:43 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=245) DEBUG 04-22 20:00:43 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors']] +(APIServer pid=1) DEBUG 04-22 20:00:51 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 20:01:01 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 20:01:11 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 20:01:21 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-22 20:01:31 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=245) INFO 04-22 20:01:37 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for microsoft/phi-2: 53.079146 seconds +(EngineCore pid=245) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/normalization.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/phi.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=245) INFO 04-22 20:01:46 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/99e20e2350/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=610adf44e9 comp=e546579c48 code=72f26a8252bd2dc07110f02adcd5ec7b0a60ebd294b581ec9fa5989f3d1bc98e dir=/data/.cache/vllm/torch_compile_cache/99e20e2350/rank_0_0/backbone +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore 
pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 
'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 
'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 
'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 
'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 
'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_PLUGINS': 
None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=245) DEBUG 04-22 
20:01:46 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': 
True, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] Vllm config hash: 610adf44e9 +(EngineCore pid=245) INFO 04-22 20:01:46 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.52 s +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=245) DEBUG 04-22 20:01:47 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=245) DEBUG 04-22 20:01:47 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=245) DEBUG 04-22 20:01:47 
[compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=245) DEBUG 04-22 20:01:47 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=245) DEBUG 04-22 20:01:47 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=245) INFO 04-22 20:01:48 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=245) DEBUG 04-22 20:01:48 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/99e20e2350/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=245) DEBUG 04-22 20:01:49 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=245) DEBUG 04-22 20:01:49 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.8 ms +(EngineCore pid=245) DEBUG 04-22 20:01:49 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 7.0 ms +(EngineCore pid=245) DEBUG 04-22 20:01:49 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=245) DEBUG 04-22 20:01:49 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 1.2 ms +(EngineCore pid=245) DEBUG 04-22 20:01:50 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/99e20e2350/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=245) DEBUG 04-22 20:01:51 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=245) DEBUG 04-22 20:01:51 [compilation/passes/vllm_inductor_pass.py:79] 
NoOpEliminationPass completed in 0.8 ms +(EngineCore pid=245) DEBUG 04-22 20:01:51 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=245) DEBUG 04-22 20:01:51 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=245) DEBUG 04-22 20:01:51 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=245) DEBUG 04-22 20:01:51 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/99e20e2350/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') +(EngineCore pid=245) INFO 04-22 20:01:51 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.04 s +(EngineCore pid=245) DEBUG 04-22 20:01:51 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/99e20e2350/rank_0_0/backbone/computation_graph.py +(APIServer pid=1) DEBUG 04-22 20:01:51 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=245) INFO 04-22 20:01:52 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/072de5f68642eb70f697736336cf9589eebd286ea377b4e297ad05380d4998c3/rank_0_0/model +(EngineCore pid=245) INFO 04-22 20:01:52 [compilation/monitor.py:48] torch.compile took 10.19 s in total +(EngineCore pid=245) INFO 04-22 20:01:53 [compilation/monitor.py:76] Initial profiling/warmup run took 0.42 s +(EngineCore pid=245) INFO 04-22 20:01:58 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=245) INFO 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 20:01:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 20:01:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 100.00 MiB first-capture + (51-1) × 4.00 MiB per-graph 
+(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 20:01:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) DEBUG 04-22 20:01:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 166.00 MiB first-capture + (51-1) × 4.00 MiB per-graph +(EngineCore pid=245) DEBUG 04-22 20:01:59 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=245) INFO 04-22 20:01:59 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.55 GiB total +(EngineCore pid=245) DEBUG 04-22 20:01:59 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=245) DEBUG 04-22 20:01:59 [v1/worker/gpu_worker.py:430] Free memory after profiling: 73.04 GiB (total), 69.59 GiB (within requested) +(EngineCore pid=245) DEBUG 04-22 20:01:59 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.32 seconds. Total non KV cache memory: 6.23GiB; torch peak memory increase: 0.79GiB; non-torch forward increase memory: 0.24GiB; weights memory: 5.19GiB. 
+(EngineCore pid=245) INFO 04-22 20:01:59 [v1/worker/gpu_worker.py:436] Available KV cache memory: 69.0 GiB +(EngineCore pid=245) INFO 04-22 20:01:59 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9570 to maintain the same effective KV cache size. +(EngineCore pid=245) INFO 04-22 20:01:59 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 226,112 tokens +(EngineCore pid=245) INFO 04-22 20:01:59 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 2,048 tokens per request: 110.41x +(EngineCore pid=245) 2026-04-22 20:01:59,942 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=245) DEBUG 04-22 20:01:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=245) 2026-04-22 20:01:59,954 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=245) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 19:55:02 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 19:55:02 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 19:55:02 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 19:55:02 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 19:55:02 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 19:55:02 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model microsoft/phi-2 +(APIServer pid=1) INFO 04-22 19:55:02 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 19:55:02 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 19:55:02 [entrypoints/utils.py:233] non-default args: {'model_tag': 'microsoft/phi-2', 'model': 'microsoft/phi-2', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 19:55:02 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 19:55:03 [model_executor/models/registry.py:774] Cached model info file for class vllm.model_executor.models.phi.PhiForCausalLM not found +(APIServer pid=1) DEBUG 04-22 19:55:03 [model_executor/models/registry.py:834] Cache model info for class vllm.model_executor.models.phi.PhiForCausalLM miss. Loading model instead. 
+(APIServer pid=1) DEBUG 04-22 19:55:14 [model_executor/models/registry.py:844] Loaded model info for class vllm.model_executor.models.phi.PhiForCausalLM +(APIServer pid=1) DEBUG 04-22 19:55:14 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 10.8903088 secs +(APIServer pid=1) INFO 04-22 19:55:14 [config/model.py:549] Resolved architecture: PhiForCausalLM +(APIServer pid=1) Traceback (most recent call last): +(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in +(APIServer pid=1) sys.exit(main()) +(APIServer pid=1) ^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main +(APIServer pid=1) args.dispatch_function(args) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd +(APIServer pid=1) uvloop.run(run_server(args)) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run +(APIServer pid=1) return __asyncio.run( +(APIServer pid=1) ^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run +(APIServer pid=1) return runner.run(main) +(APIServer pid=1) ^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run +(APIServer pid=1) return self._loop.run_until_complete(task) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper +(APIServer pid=1) return await main +(APIServer pid=1) ^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server +(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) +(APIServer pid=1) File 
"/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker +(APIServer pid=1) async with build_async_engine_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client +(APIServer pid=1) async with build_async_engine_client_from_engine_args( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 124, in build_async_engine_client_from_engine_args +(APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1549, in create_engine_config +(APIServer pid=1) model_config = self.create_model_config() +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1398, in create_model_config +(APIServer pid=1) return ModelConfig( +(APIServer pid=1) ^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/pydantic/_internal/_dataclasses.py", line 121, in __init__ +(APIServer pid=1) s.__pydantic_validator__.validate_python(ArgsKwargs(args, kwargs), self_instance=s) +(APIServer pid=1) pydantic_core._pydantic_core.ValidationError: 1 validation error for ModelConfig +(APIServer pid=1) Value error, User-specified max_model_len 
(8192) is greater than the derived max_model_len (max_position_embeddings=2048.0 or model_max_length=None in model's config.json). To allow overriding this maximum, set the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme caution. If the model uses relative position encoding (RoPE), positions exceeding derived_max_model_len lead to nan. If the model uses absolute position encoding, positions exceeding derived_max_model_len will cause a CUDA array out-of-bounds error. [type=value_error, input_value=ArgsKwargs((), {'model': ...nderer_num_workers': 1}), input_type=ArgsKwargs] +(APIServer pid=1) For further information visit https://errors.pydantic.dev/2.12/v/value_error diff --git a/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp1pp1dp1--8192.FAILED.log b/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp1pp1dp1--8192.FAILED.log new file mode 100644 index 00000000..0d44cb0c --- /dev/null +++ b/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp1pp1dp1--8192.FAILED.log @@ -0,0 +1,1112 @@ +DEBUG 04-22 00:58:18 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:58:18 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:58:18 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:58:18 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:58:18 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:58:18 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:58:18 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:58:18 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:58:18 [platforms/__init__.py:165] Checking if CPU platform is available. 
+DEBUG 04-22 00:58:18 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:58:18 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:58:18 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:58:23 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:58:25 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 00:58:25 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:58:25 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:58:25 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:58:25 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:58:25 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:58:25 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:58:25 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:58:25 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model openai/gpt-oss-20b +(APIServer pid=1) INFO 04-22 00:58:25 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:58:25 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:58:25 [entrypoints/utils.py:233] non-default args: {'model_tag': 'openai/gpt-oss-20b', 'model': 'openai/gpt-oss-20b', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:58:25 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:58:25 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.gpt_oss.GptOssForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:58:25 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0013268 secs +(APIServer pid=1) INFO 04-22 00:58:25 [config/model.py:549] Resolved architecture: GptOssForCausalLM +(APIServer pid=1) Parse safetensors files: 0%| | 0/3 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=248) DEBUG 04-22 00:58:37 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=248) DEBUG 04-22 00:58:37 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=248) INFO 04-22 00:58:37 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='openai/gpt-oss-20b', speculative_config=None, tokenizer='openai/gpt-oss-20b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=mxfp4, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='openai_gptoss', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=openai/gpt-oss-20b, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 1024, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=248) DEBUG 04-22 00:58:38 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.34:47009 backend=nccl +(EngineCore pid=248) INFO 04-22 00:58:38 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.34:47009 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +(EngineCore pid=248) DEBUG 04-22 00:58:38 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=248) INFO 04-22 00:58:38 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A +(EngineCore pid=248) DEBUG 04-22 00:58:38 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776819518.7695065, auto_measure=True +(EngineCore pid=248) DEBUG 04-22 00:58:38 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=248) DEBUG 04-22 00:58:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=248) DEBUG 04-22 00:58:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=248) DEBUG 04-22 00:58:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=248) DEBUG 04-22 00:58:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward 
method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=248) DEBUG 04-22 00:58:38 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(EngineCore pid=248) DEBUG 04-22 00:58:38 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=248) DEBUG 04-22 00:58:38 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=248) DEBUG 04-22 00:58:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=248) INFO 04-22 00:58:38 [v1/worker/gpu_model_runner.py:4735] Starting to load model openai/gpt-oss-20b... +(EngineCore pid=248) DEBUG 04-22 00:58:39 [model_executor/.../quantization/mxfp4.py:75] MXFP4 linear layer is not implemented - falling back to UnquantizedLinearMethod. +(EngineCore pid=248) DEBUG 04-22 00:58:39 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=64, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=True, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASHINFER: [attention sinks not supported], FLEX_ATTENTION: [attention sinks not supported]}. +(EngineCore pid=248) INFO 04-22 00:58:39 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'TRITON_ATTN']. +(EngineCore pid=248) INFO 04-22 00:58:39 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=248) DEBUG 04-22 00:58:39 [model_executor/.../quantization/mxfp4.py:84] MXFP4 attention layer is not implemented. Skipping quantization for this layer. 
+(EngineCore pid=248) DEBUG 04-22 00:58:39 [model_executor/.../oracle/mxfp4.py:355] Mxfp4 MoE backend 'FLASHINFER_TRTLLM_MXFP4_BF16' does not support the deployment configuration since kernel does not support current device cuda. +(EngineCore pid=248) DEBUG 04-22 00:58:39 [model_executor/.../oracle/mxfp4.py:355] Mxfp4 MoE backend 'CK' does not support the deployment configuration since kernel does not support current device cuda. +(EngineCore pid=248) INFO 04-22 00:58:39 [model_executor/.../oracle/mxfp4.py:352] Using 'TRITON' Mxfp4 MoE backend. +(EngineCore pid=248) DEBUG 04-22 00:58:39 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts +(EngineCore pid=248) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=248) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=248) DEBUG 04-22 00:58:39 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=248) DEBUG 04-22 00:58:39 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=248) DEBUG 04-22 00:58:39 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 49, 'fused_moe': 24, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=248) DEBUG 04-22 00:58:39 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(EngineCore pid=248) DEBUG 04-22 00:58:39 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00000-of-00002.safetensors', 'model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors']] +(EngineCore pid=248) Loading safetensors checkpoint shards: 0% Completed | 0/3 [00:00 +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/router/gate_linear.py +(EngineCore pid=248) 
DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/mxfp4.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gpt_oss.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=248) INFO 04-22 00:58:57 [compilation/backends.py:1051] Using cache directory: 
/data/.cache/vllm/torch_compile_cache/529e03afeb/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=740e479b7e comp=e546579c48 code=0467e540cccd21fe4f6a70bb62bff632298390212396e8c7305d702bcfcff1ab dir=/data/.cache/vllm/torch_compile_cache/529e03afeb/rank_0_0/backbone +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 
'auto', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, 
+(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=248) DEBUG 04-22 00:58:57 
[compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=248) DEBUG 04-22 00:58:57 
[compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=248) DEBUG 04-22 00:58:57 
[compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
+(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} +(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] Vllm config hash: 740e479b7e +(EngineCore pid=248) INFO 04-22 00:58:57 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.32 s +(APIServer pid=1) DEBUG 04-22 00:58:57 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=248) DEBUG 04-22 00:58:58 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=248) DEBUG 04-22 00:58:58 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=248) DEBUG 04-22 00:58:58 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=248) DEBUG 04-22 00:58:58 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=248) DEBUG 04-22 00:58:58 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms +(EngineCore pid=248) DEBUG 04-22 00:58:58 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=248) DEBUG 04-22 00:58:58 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=248) INFO 04-22 00:59:00 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=248) DEBUG 04-22 00:59:00 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/529e03afeb/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=248) DEBUG 04-22 00:59:01 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=248) DEBUG 04-22 00:59:01 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=248) DEBUG 
04-22 00:59:01 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms +(EngineCore pid=248) DEBUG 04-22 00:59:01 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=248) DEBUG 04-22 00:59:01 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=248) DEBUG 04-22 00:59:02 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/529e03afeb/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=248) DEBUG 04-22 00:59:02 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=248) DEBUG 04-22 00:59:02 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(EngineCore pid=248) DEBUG 04-22 00:59:02 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms +(EngineCore pid=248) DEBUG 04-22 00:59:02 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=248) DEBUG 04-22 00:59:02 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=248) DEBUG 04-22 00:59:02 [compilation/backends.py:377] Store the 24-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_24', '/data/.cache/vllm/torch_compile_cache/529e03afeb/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_24') +(EngineCore pid=248) INFO 04-22 00:59:02 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 4.61 s +(EngineCore pid=248) DEBUG 04-22 00:59:02 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/529e03afeb/rank_0_0/backbone/computation_graph.py +(EngineCore 
pid=248) INFO 04-22 00:59:04 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/fa25af138620590d2fed8e4ac24c59bbe55ddebfc4568c415c2ad3cab596cd1a/rank_0_0/model +(EngineCore pid=248) INFO 04-22 00:59:04 [compilation/monitor.py:48] torch.compile took 9.54 s in total +(APIServer pid=1) DEBUG 04-22 00:59:07 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=248) INFO 04-22 00:59:08 [compilation/monitor.py:76] Initial profiling/warmup run took 4.77 s +(EngineCore pid=248) INFO 04-22 00:59:14 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=1024 +(EngineCore pid=248) DEBUG 04-22 00:59:14 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=248) INFO 04-22 00:59:14 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=83 (largest=1024), FULL=83 (largest=1024) +(EngineCore pid=248) DEBUG 04-22 00:59:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=248) DEBUG 04-22 00:59:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=248) DEBUG 04-22 00:59:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=1024, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=248) DEBUG 04-22 00:59:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=248) DEBUG 04-22 00:59:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=248) DEBUG 04-22 00:59:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=1008, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=248) DEBUG 04-22 00:59:15 
[v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 186.00 MiB first-capture + (83-1) × 6.00 MiB per-graph +(EngineCore pid=248) DEBUG 04-22 00:59:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=248) DEBUG 04-22 00:59:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=248) DEBUG 04-22 00:59:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=1024, num_reqs=1024, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=248) DEBUG 04-22 00:59:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=248) DEBUG 04-22 00:59:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=248) DEBUG 04-22 00:59:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=1008, num_reqs=1008, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=248) DEBUG 04-22 00:59:15 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 6.00 MiB first-capture + (83-1) × 8.00 MiB per-graph +(EngineCore pid=248) DEBUG 04-22 00:59:16 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=248) INFO 04-22 00:59:16 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.30 GiB total +(EngineCore pid=248) DEBUG 04-22 00:59:16 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=248) DEBUG 04-22 00:59:16 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.77 GiB (total), 59.33 GiB (within requested) +(EngineCore pid=248) DEBUG 04-22 00:59:16 [v1/worker/gpu_worker.py:435] Memory profiling takes 21.74 seconds. 
Total non KV cache memory: 16.76GiB; torch peak memory increase: 2.87GiB; non-torch forward increase memory: 0.25GiB; weights memory: 13.64GiB. +(EngineCore pid=248) INFO 04-22 00:59:16 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.47 GiB +(EngineCore pid=248) INFO 04-22 00:59:16 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9665 to maintain the same effective KV cache size. +(EngineCore pid=248) INFO 04-22 00:59:16 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,277,184 tokens +(EngineCore pid=248) INFO 04-22 00:59:16 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 155.75x +(EngineCore pid=248) 2026-04-22 00:59:16,275 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(EngineCore pid=248) DEBUG 04-22 00:59:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=248) 2026-04-22 00:59:16,316 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=248) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/83 [00:00 +(APIServer pid=1) sys.exit(main()) +(APIServer pid=1) ^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main +(APIServer pid=1) args.dispatch_function(args) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd +(APIServer pid=1) uvloop.run(run_server(args)) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run +(APIServer pid=1) return __asyncio.run( +(APIServer pid=1) ^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run +(APIServer pid=1) return runner.run(main) +(APIServer pid=1) ^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run +(APIServer pid=1) return self._loop.run_until_complete(task) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper +(APIServer pid=1) return await main +(APIServer pid=1) ^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server +(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker +(APIServer pid=1) async with build_async_engine_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ 
+(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client +(APIServer pid=1) async with build_async_engine_client_from_engine_args( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args +(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config +(APIServer pid=1) return cls( +(APIServer pid=1) ^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 154, in __init__ +(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(APIServer pid=1) return func(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client +(APIServer pid=1) return AsyncMPClient(*client_args) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(APIServer pid=1) return func(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File 
"/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 887, in __init__ +(APIServer pid=1) super().__init__( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 535, in __init__ +(APIServer pid=1) with launch_core_engines( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ +(APIServer pid=1) next(self.gen) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines +(APIServer pid=1) wait_for_engine_startup( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup +(APIServer pid=1) raise RuntimeError( +(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} diff --git a/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.FAILED.log b/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.FAILED.log new file mode 100644 index 00000000..6ee6d6d3 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.FAILED.log @@ -0,0 +1,2104 @@ +DEBUG 04-22 19:52:48 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 19:52:48 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 19:52:48 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 19:52:48 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:52:48 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:52:48 [platforms/__init__.py:113] Checking if ROCm platform is available. 
+DEBUG 04-22 19:52:48 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 19:52:48 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 19:52:48 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 19:52:48 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:52:48 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:52:48 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 19:52:52 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 19:52:54 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 19:52:54 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 19:52:54 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 19:52:54 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 19:52:54 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 19:52:54 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 19:52:54 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 19:52:54 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 19:52:54 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model openai/gpt-oss-20b +(APIServer pid=1) INFO 04-22 19:52:54 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 19:52:54 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 19:52:54 [entrypoints/utils.py:233] non-default args: {'model_tag': 'openai/gpt-oss-20b', 'model': 'openai/gpt-oss-20b', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 19:52:54 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 19:52:55 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.gpt_oss.GptOssForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 19:52:55 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0027820 secs +(APIServer pid=1) INFO 04-22 19:52:55 [config/model.py:549] Resolved architecture: GptOssForCausalLM +(APIServer pid=1) Parse safetensors files: 0%| | 0/3 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) DEBUG 04-22 19:52:57 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. 
+(APIServer pid=1) INFO 04-22 19:52:57 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 19:52:57 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 19:52:57 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 19:52:58 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 19:52:58 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 19:53:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 19:53:01 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 19:53:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 19:53:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:53:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:53:01 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 19:53:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 19:53:01 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 19:53:01 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 19:53:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:53:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:53:01 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 19:53:06 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(APIServer pid=1) DEBUG 04-22 19:53:08 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. 
+(EngineCore pid=250) DEBUG 04-22 19:53:08 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 19:53:08 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=250) DEBUG 04-22 19:53:08 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/78b56809-ee37-459c-aba1-e0f72246333f'], outputs=['ipc:///tmp/9f7c989d-ac51-4563-bcdd-5b5a4d6ac892'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=250) DEBUG 04-22 19:53:08 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=250) DEBUG 04-22 19:53:08 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=250) DEBUG 04-22 19:53:08 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=250) DEBUG 04-22 19:53:08 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=250) DEBUG 04-22 19:53:08 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=250) INFO 04-22 19:53:08 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='openai/gpt-oss-20b', speculative_config=None, tokenizer='openai/gpt-oss-20b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=mxfp4, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='openai_gptoss', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=openai/gpt-oss-20b, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 1024, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=250) WARNING 04-22 19:53:08 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=250) INFO 04-22 19:53:08 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.130.3.158 (local), world_size=2, local_world_size=2 +(EngineCore pid=250) DEBUG 04-22 19:53:08 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/f9dc1f06-7f86-4f82-ac37-a14d48e38fc0 +(EngineCore pid=250) DEBUG 04-22 19:53:08 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_da7bfba3'), local_subscribe_addr='ipc:///tmp/f9dc1f06-7f86-4f82-ac37-a14d48e38fc0', local_notify_addr='ipc:///tmp/f2478e9c-eaaa-4243-b0e4-a20bc65277fc', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 19:53:11 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 19:53:11 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 19:53:11 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 19:53:11 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:53:11 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:53:11 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 19:53:11 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 19:53:11 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 19:53:11 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 19:53:11 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:53:11 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:53:11 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-22 19:53:12 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 19:53:12 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 19:53:12 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 19:53:12 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:53:12 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:53:12 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 19:53:12 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 19:53:12 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 19:53:12 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 19:53:12 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:53:12 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:53:12 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 19:53:16 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 19:53:16 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 19:53:18 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 19:53:18 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 19:53:18 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 19:53:18 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+DEBUG 04-22 19:53:18 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 19:53:18 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 19:53:18 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 19:53:18 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) DEBUG 04-22 19:53:18 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker pid=449) DEBUG 04-22 19:53:18 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:42747 backend=nccl +(Worker pid=449) INFO 04-22 19:53:18 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:42747 backend=nccl +(Worker pid=450) DEBUG 04-22 19:53:18 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:42747 backend=nccl +(Worker pid=450) INFO 04-22 19:53:18 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:42747 backend=nccl +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=450) DEBUG 04-22 19:53:18 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=449) DEBUG 04-22 19:53:18 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 1 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 +(Worker pid=450) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=450) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=449) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=449) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=449) DEBUG 04-22 19:53:19 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=449) INFO 04-22 19:53:19 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=450) DEBUG 04-22 19:53:19 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=449) DEBUG 04-22 19:53:19 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=449) DEBUG 04-22 19:53:19 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/77daf599-eb59-429d-a2a7-22371debd17a +(Worker pid=449) DEBUG 04-22 19:53:19 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_f1f3ea6b'), local_subscribe_addr='ipc:///tmp/77daf599-eb59-429d-a2a7-22371debd17a', local_notify_addr='ipc:///tmp/a35154c1-e83c-418a-8993-92e9baeaa6ef', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=450) DEBUG 04-22 19:53:19 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/77daf599-eb59-429d-a2a7-22371debd17a +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 +(Worker pid=449) INFO 04-22 19:53:20 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A +(Worker pid=450) DEBUG 04-22 19:53:20 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.18GiB, total_memory=79.19GiB, cuda_memory=2.01GiB, torch_memory=0.02GiB, non_torch_memory=1.99GiB, timestamp=1776887600.382866, auto_measure=True +(Worker pid=450) DEBUG 04-22 19:53:20 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=449) DEBUG 04-22 19:53:20 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.18GiB, total_memory=79.19GiB, cuda_memory=2.01GiB, torch_memory=0.02GiB, non_torch_memory=1.99GiB, timestamp=1776887600.4114015, auto_measure=True +(Worker pid=449) DEBUG 04-22 19:53:20 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=450) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=450) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=450) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=449) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=449) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=449) DEBUG 04-22 19:53:20 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=450) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=450) DEBUG 04-22 19:53:20 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=449) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=449) DEBUG 04-22 19:53:20 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=449) DEBUG 04-22 19:53:20 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=449) DEBUG 04-22 19:53:20 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker pid=449) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker_TP0 pid=449) INFO 04-22 19:53:20 [v1/worker/gpu_model_runner.py:4735] Starting to load model openai/gpt-oss-20b... +(Worker_TP1 pid=450) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker_TP0 pid=449) DEBUG 04-22 19:53:20 [model_executor/.../quantization/mxfp4.py:75] MXFP4 linear layer is not implemented - falling back to UnquantizedLinearMethod. 
+(Worker_TP0 pid=449) DEBUG 04-22 19:53:20 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=64, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=True, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASHINFER: [attention sinks not supported], FLEX_ATTENTION: [attention sinks not supported]}. +(Worker_TP0 pid=449) INFO 04-22 19:53:20 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'TRITON_ATTN']. +(Worker_TP0 pid=449) INFO 04-22 19:53:20 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP0 pid=449) DEBUG 04-22 19:53:20 [model_executor/.../quantization/mxfp4.py:84] MXFP4 attention layer is not implemented. Skipping quantization for this layer. +(Worker_TP0 pid=449) DEBUG 04-22 19:53:20 [model_executor/.../oracle/mxfp4.py:355] Mxfp4 MoE backend 'FLASHINFER_TRTLLM_MXFP4_BF16' does not support the deployment configuration since kernel does not support current device cuda. +(Worker_TP0 pid=449) DEBUG 04-22 19:53:20 [model_executor/.../oracle/mxfp4.py:355] Mxfp4 MoE backend 'CK' does not support the deployment configuration since kernel does not support current device cuda. +(Worker_TP0 pid=449) INFO 04-22 19:53:20 [model_executor/.../oracle/mxfp4.py:352] Using 'TRITON' Mxfp4 MoE backend. 
+(Worker_TP0 pid=449) DEBUG 04-22 19:53:20 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts +(Worker_TP1 pid=450) DEBUG 04-22 19:53:20 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP1 pid=450) DEBUG 04-22 19:53:21 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP1 pid=450) DEBUG 04-22 19:53:21 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 49, 'fused_moe': 24, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=450) DEBUG 04-22 19:53:21 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP0 pid=449) DEBUG 04-22 19:53:21 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=449) DEBUG 04-22 19:53:21 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=449) DEBUG 04-22 19:53:21 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 49, 'fused_moe': 24, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=449) DEBUG 04-22 19:53:21 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP0 pid=449) DEBUG 04-22 19:53:21 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors', 'model-00000-of-00002.safetensors']] +(Worker_TP1 pid=450) DEBUG 04-22 19:53:21 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00000-of-00002.safetensors', 'model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors']] +(Worker_TP0 pid=449) Loading safetensors checkpoint shards: 0% Completed | 0/3 [00:00 +(Worker_TP1 pid=450) DEBUG 04-22 19:53:34 [compilation/decorators.py:528] Start compiling function +(EngineCore pid=250) DEBUG 04-22 19:53:34 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/modular_kernel.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/prepare_finalize/no_dp_ep.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/router/gate_linear.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/mxfp4.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gpt_oss.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=98b6f8a08d comp=e546579c48 code=3e1e8df059eb2a59571f3775ec43375257349289804d84b696e7e8ef247e8675 dir=/data/.cache/vllm/torch_compile_cache/c31ab6d213/rank_1_0/backbone +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 
[compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 
[compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP1 pid=450) DEBUG 04-22 
19:53:37 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': 
None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, 
+(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP1 pid=450) 
DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': 
True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 
[compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP1 pid=450) DEBUG 
04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, 
+(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] Vllm config hash: 98b6f8a08d +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py +(Worker_TP0 
pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/modular_kernel.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/prepare_finalize/no_dp_ep.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/router/gate_linear.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/mxfp4.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(Worker_TP0 pid=449) DEBUG 04-22 
19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gpt_oss.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP0 pid=449) INFO 04-22 19:53:37 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/c31ab6d213/rank_0_0/backbone for vLLM's torch.compile +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=98b6f8a08d comp=e546579c48 code=3e1e8df059eb2a59571f3775ec43375257349289804d84b696e7e8ef247e8675 dir=/data/.cache/vllm/torch_compile_cache/c31ab6d213/rank_0_0/backbone +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP0 pid=449) DEBUG 
04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP0 pid=449) DEBUG 04-22 
19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP0 pid=449) DEBUG 
04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 
'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 
'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 
'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP0 pid=449) DEBUG 
04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 
0, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP0 pid=449) DEBUG 
04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] Vllm config hash: 98b6f8a08d +(Worker_TP0 pid=449) INFO 04-22 19:53:37 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.80 s +(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 8192 +(Worker_TP0 pid=449) INFO 04-22 19:53:37 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm +(Worker_TP0 pid=449) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. 
+(Worker_TP0 pid=449) return func(*args, **kwargs) +(Worker_TP0 pid=449) DEBUG 04-22 19:53:38 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=8192, hidden_dim=2880, dtype=torch.bfloat16 +(Worker_TP1 pid=450) DEBUG 04-22 19:53:38 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=8192, hidden_dim=2880, dtype=torch.bfloat16 +(Worker_TP0 pid=449) INFO 04-22 19:53:38 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm +(APIServer pid=1) DEBUG 04-22 19:53:38 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=449) DEBUG 04-22 19:53:38 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(Worker_TP0 pid=449) DEBUG 04-22 19:53:38 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(Worker_TP1 pid=450) DEBUG 04-22 19:53:39 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=450) DEBUG 04-22 19:53:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=449) DEBUG 04-22 19:53:39 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=449) DEBUG 04-22 19:53:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=450) DEBUG 04-22 19:53:39 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP1 pid=450) DEBUG 04-22 19:53:39 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 24.6 ms +(Worker_TP1 pid=450) DEBUG 04-22 19:53:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 
pid=450) DEBUG 04-22 19:53:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP1 pid=450) DEBUG 04-22 19:53:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=449) DEBUG 04-22 19:53:39 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns +(Worker_TP0 pid=449) DEBUG 04-22 19:53:39 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 23.6 ms +(Worker_TP0 pid=449) DEBUG 04-22 19:53:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=449) DEBUG 04-22 19:53:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes +(Worker_TP0 pid=449) DEBUG 04-22 19:53:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=449) INFO 04-22 19:53:41 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(Worker_TP0 pid=449) DEBUG 04-22 19:53:41 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/c31ab6d213/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(Worker_TP1 pid=450) DEBUG 04-22 19:53:41 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=450) DEBUG 04-22 19:53:41 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=449) DEBUG 04-22 19:53:41 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=449) DEBUG 04-22 19:53:41 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=450) DEBUG 04-22 19:53:41 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 
pid=450) DEBUG 04-22 19:53:41 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.7 ms +(Worker_TP1 pid=450) DEBUG 04-22 19:53:41 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=450) DEBUG 04-22 19:53:41 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=450) DEBUG 04-22 19:53:41 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=449) DEBUG 04-22 19:53:42 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=449) DEBUG 04-22 19:53:42 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.0 ms +(Worker_TP0 pid=449) DEBUG 04-22 19:53:42 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=449) DEBUG 04-22 19:53:42 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=449) DEBUG 04-22 19:53:42 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=449) DEBUG 04-22 19:53:42 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/c31ab6d213/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(Worker_TP1 pid=450) DEBUG 04-22 19:53:43 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=450) DEBUG 04-22 19:53:43 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=450) DEBUG 04-22 19:53:43 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=450) DEBUG 04-22 19:53:43 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.4 ms +(Worker_TP1 
pid=450) DEBUG 04-22 19:53:43 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP1 pid=450) DEBUG 04-22 19:53:43 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP1 pid=450) DEBUG 04-22 19:53:43 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=449) DEBUG 04-22 19:53:43 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=449) DEBUG 04-22 19:53:43 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(Worker_TP0 pid=449) DEBUG 04-22 19:53:43 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=449) DEBUG 04-22 19:53:43 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 37.5 ms +(Worker_TP0 pid=449) DEBUG 04-22 19:53:43 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms +(Worker_TP0 pid=449) DEBUG 04-22 19:53:43 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP0 pid=449) DEBUG 04-22 19:53:43 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=449) DEBUG 04-22 19:53:44 [compilation/backends.py:377] Store the 24-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_24', '/data/.cache/vllm/torch_compile_cache/c31ab6d213/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_24') +(Worker_TP0 pid=449) INFO 04-22 19:53:44 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.52 s +(Worker_TP0 pid=449) DEBUG 04-22 19:53:44 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/c31ab6d213/rank_0_0/backbone/computation_graph.py +(Worker_TP0 pid=449) INFO 04-22 19:53:44 
[compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/12de341f13df4ddc331114846a3ecb4479ec6abe2ec0ad2d0a2165f5ac81498e/rank_0_0/model +(Worker_TP0 pid=449) INFO 04-22 19:53:44 [compilation/monitor.py:48] torch.compile took 10.59 s in total +(APIServer pid=1) DEBUG 04-22 19:53:48 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker_TP0 pid=449) INFO 04-22 19:53:49 [compilation/monitor.py:76] Initial profiling/warmup run took 4.67 s +(Worker_TP1 pid=450) INFO 04-22 19:53:54 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=1024 +(Worker_TP1 pid=450) DEBUG 04-22 19:53:54 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP1 pid=450) INFO 04-22 19:53:54 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=83 (largest=1024), FULL=83 (largest=1024) +(Worker_TP0 pid=449) INFO 04-22 19:53:55 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=1024 +(Worker_TP0 pid=449) DEBUG 04-22 19:53:55 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP0 pid=449) INFO 04-22 19:53:55 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=83 (largest=1024), FULL=83 (largest=1024) +(Worker_TP1 pid=450) DEBUG 04-22 19:53:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=449) DEBUG 04-22 19:53:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=449) DEBUG 04-22 19:53:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=449) DEBUG 04-22 19:53:55 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=1024, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) 
+(Worker_TP1 pid=450) DEBUG 04-22 19:53:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=450) DEBUG 04-22 19:53:55 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=1024, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=449) DEBUG 04-22 19:53:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=450) DEBUG 04-22 19:53:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=1008, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=1008, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 128.00 MiB first-capture + (83-1) × 6.00 MiB per-graph +(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 128.00 MiB first-capture + (83-1) × 6.00 MiB per-graph +(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: 
None +(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=1024, num_reqs=1024, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=1024, num_reqs=1024, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=1008, num_reqs=1008, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=1008, num_reqs=1008, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 8.00 MiB first-capture + (83-1) × 6.00 MiB per-graph +(Worker_TP0 pid=449) INFO 04-22 19:53:56 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 8.00 MiB first-capture + (83-1) × 6.00 MiB per-graph +(Worker_TP1 pid=450) INFO 04-22 19:53:56 
[distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP0 pid=449) INFO 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.09 GiB total +(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP1 pid=450) INFO 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.09 GiB total +(Worker_TP0 pid=449) DEBUG 04-22 19:53:57 [v1/worker/gpu_worker.py:424] Initial free memory: 77.18 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP0 pid=449) DEBUG 04-22 19:53:57 [v1/worker/gpu_worker.py:430] Free memory after profiling: 66.27 GiB (total), 64.32 GiB (within requested) +(Worker_TP0 pid=449) DEBUG 04-22 19:53:57 [v1/worker/gpu_worker.py:435] Memory profiling takes 23.01 seconds. Total non KV cache memory: 11.95GiB; torch peak memory increase: 2.87GiB; non-torch forward increase memory: 2.07GiB; weights memory: 7.01GiB. +(Worker_TP0 pid=449) INFO 04-22 19:53:57 [v1/worker/gpu_worker.py:436] Available KV cache memory: 63.28 GiB +(Worker_TP0 pid=449) INFO 04-22 19:53:57 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9637 to maintain the same effective KV cache size. 
+(Worker_TP0 pid=449) DEBUG 04-22 19:53:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=250) DEBUG 04-22 19:53:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=250) DEBUG 04-22 19:53:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=450) DEBUG 04-22 19:53:57 [v1/worker/gpu_worker.py:424] Initial free memory: 77.18 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP1 pid=450) DEBUG 04-22 19:53:57 [v1/worker/gpu_worker.py:430] Free memory after profiling: 66.27 GiB (total), 64.32 GiB (within requested) +(Worker_TP1 pid=450) DEBUG 04-22 19:53:57 [v1/worker/gpu_worker.py:435] Memory profiling takes 23.06 seconds. Total non KV cache memory: 11.95GiB; torch peak memory increase: 2.87GiB; non-torch forward increase memory: 2.07GiB; weights memory: 7.01GiB. +(Worker_TP1 pid=450) INFO 04-22 19:53:57 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9637 to maintain the same effective KV cache size. 
+(Worker_TP1 pid=450) DEBUG 04-22 19:53:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=250) DEBUG 04-22 19:53:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=250) INFO 04-22 19:53:57 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 2,764,768 tokens +(EngineCore pid=250) INFO 04-22 19:53:57 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 337.17x +(Worker_TP0 pid=449) DEBUG 04-22 19:53:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=450) DEBUG 04-22 19:53:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=450) 2026-04-22 19:53:57,277 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP0 pid=449) 2026-04-22 19:53:57,277 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
+(Worker_TP0 pid=449) DEBUG 04-22 19:53:57 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=450) DEBUG 04-22 19:53:57 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=450) 2026-04-22 19:53:57,319 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP0 pid=449) 2026-04-22 19:53:57,320 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP0 pid=449) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/83 [00:00 +(APIServer pid=1) sys.exit(main()) +(APIServer pid=1) ^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main +(APIServer pid=1) args.dispatch_function(args) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd +(APIServer pid=1) uvloop.run(run_server(args)) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run +(APIServer pid=1) return __asyncio.run( +(APIServer pid=1) ^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run +(APIServer pid=1) return runner.run(main) +(APIServer pid=1) ^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run +(APIServer pid=1) return self._loop.run_until_complete(task) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper +(APIServer pid=1) return await main +(APIServer pid=1) ^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server +(APIServer pid=1) await run_server_worker(listen_address, sock, args, 
**uvicorn_kwargs) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker +(APIServer pid=1) async with build_async_engine_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client +(APIServer pid=1) async with build_async_engine_client_from_engine_args( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ +(APIServer pid=1) return await anext(self.gen) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args +(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config +(APIServer pid=1) return cls( +(APIServer pid=1) ^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 154, in __init__ +(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(APIServer pid=1) return func(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client +(APIServer pid=1) return AsyncMPClient(*client_args) +(APIServer pid=1) 
^^^^^^^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper +(APIServer pid=1) return func(*args, **kwargs) +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 887, in __init__ +(APIServer pid=1) super().__init__( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 535, in __init__ +(APIServer pid=1) with launch_core_engines( +(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^ +(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ +(APIServer pid=1) next(self.gen) +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines +(APIServer pid=1) wait_for_engine_startup( +(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup +(APIServer pid=1) raise RuntimeError( +(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} +/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 2 leaked shared_memory objects to clean up at shutdown + warnings.warn('resource_tracker: There appear to be %d ' diff --git a/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.log new file mode 100644 index 00000000..c1d2580f --- /dev/null +++ b/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.log @@ -0,0 +1,1855 @@ +DEBUG 04-22 19:59:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 19:59:01 [platforms/__init__.py:37] Checking if TPU platform is available. 
+DEBUG 04-22 19:59:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 19:59:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:59:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:59:01 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 19:59:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 19:59:01 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 19:59:01 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 19:59:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:59:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:59:01 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 19:59:06 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 19:59:08 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 19:59:08 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 19:59:08 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 19:59:08 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 19:59:08 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 19:59:08 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 19:59:08 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 19:59:08 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 19:59:08 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model openai/gpt-oss-20b +(APIServer pid=1) INFO 04-22 19:59:08 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 19:59:08 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 19:59:08 [entrypoints/utils.py:233] non-default args: {'model_tag': 'openai/gpt-oss-20b', 'model': 'openai/gpt-oss-20b', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 19:59:08 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 19:59:08 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.gpt_oss.GptOssForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 19:59:08 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0032642 secs +(APIServer pid=1) INFO 04-22 19:59:08 [config/model.py:549] Resolved architecture: GptOssForCausalLM +(APIServer pid=1) Parse safetensors files: 0%| | 0/3 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) DEBUG 04-22 19:59:10 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. 
+(APIServer pid=1) INFO 04-22 19:59:10 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 19:59:10 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 19:59:10 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 19:59:11 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 19:59:11 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 19:59:15 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 19:59:15 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 19:59:15 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 19:59:15 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:59:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:59:15 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 19:59:15 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 19:59:15 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 19:59:15 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 19:59:15 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:59:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:59:15 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 19:59:19 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=245) DEBUG 04-22 19:59:21 [v1/engine/core.py:1018] Waiting for init message from front-end. 
+(APIServer pid=1) DEBUG 04-22 19:59:21 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=245) DEBUG 04-22 19:59:21 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/d3d31af3-ecd1-47c6-9854-1df8e53c0ac1'], outputs=['ipc:///tmp/4eaa8040-fdd7-47d7-a5e1-3383bf7c5ce1'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=245) DEBUG 04-22 19:59:21 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=245) DEBUG 04-22 19:59:21 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=245) DEBUG 04-22 19:59:21 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=245) DEBUG 04-22 19:59:21 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=245) DEBUG 04-22 19:59:21 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=245) INFO 04-22 19:59:21 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='openai/gpt-oss-20b', speculative_config=None, tokenizer='openai/gpt-oss-20b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=mxfp4, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='openai_gptoss', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=openai/gpt-oss-20b, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 1024, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=245) WARNING 04-22 19:59:21 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=245) INFO 04-22 19:59:21 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.128.11.14 (local), world_size=2, local_world_size=2 +(EngineCore pid=245) DEBUG 04-22 19:59:21 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/a2eb9f62-781b-4250-9ef5-a396a73de42c +(EngineCore pid=245) DEBUG 04-22 19:59:21 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_800509af'), local_subscribe_addr='ipc:///tmp/a2eb9f62-781b-4250-9ef5-a396a73de42c', local_notify_addr='ipc:///tmp/ab131fb1-009e-482c-93dd-b205df334824', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 19:59:24 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 19:59:24 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 19:59:24 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 19:59:24 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:59:24 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 19:59:24 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 19:59:24 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 19:59:24 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:59:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:59:24 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 19:59:24 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 19:59:24 [platforms/__init__.py:134] Checking if XPU platform is available. 
+DEBUG 04-22 19:59:24 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 19:59:24 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:59:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:59:24 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 19:59:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:59:24 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 19:59:24 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 19:59:24 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 19:59:24 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 19:59:24 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 19:59:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 19:59:24 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 19:59:29 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 19:59:29 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 19:59:30 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 19:59:30 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 19:59:30 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 19:59:30 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+DEBUG 04-22 19:59:30 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 19:59:30 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 19:59:30 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 19:59:30 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(Worker pid=444) DEBUG 04-22 19:59:31 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:34811 backend=nccl +(Worker pid=444) INFO 04-22 19:59:31 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:34811 backend=nccl +(APIServer pid=1) DEBUG 04-22 19:59:31 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(Worker pid=445) DEBUG 04-22 19:59:31 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:34811 backend=nccl +(Worker pid=445) INFO 04-22 19:59:31 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:34811 backend=nccl +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=444) DEBUG 04-22 19:59:31 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=445) DEBUG 04-22 19:59:31 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 +(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=444) DEBUG 04-22 19:59:31 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=444) INFO 04-22 19:59:31 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=445) DEBUG 04-22 19:59:32 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=444) DEBUG 04-22 19:59:32 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
+(Worker pid=444) DEBUG 04-22 19:59:32 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/3a64d419-020b-415e-9781-8f6479f087fc +(Worker pid=444) DEBUG 04-22 19:59:32 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_f3c4eead'), local_subscribe_addr='ipc:///tmp/3a64d419-020b-415e-9781-8f6479f087fc', local_notify_addr='ipc:///tmp/5abf5589-5384-4424-a5e8-51f14f549c6d', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=445) DEBUG 04-22 19:59:32 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/3a64d419-020b-415e-9781-8f6479f087fc +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 1 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 +(Worker pid=444) INFO 04-22 19:59:32 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A +(Worker pid=444) DEBUG 04-22 19:59:32 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.18GiB, total_memory=79.19GiB, cuda_memory=2.01GiB, torch_memory=0.02GiB, non_torch_memory=1.99GiB, timestamp=1776887972.6513388, auto_measure=True +(Worker pid=444) DEBUG 04-22 19:59:32 [v1/worker/gpu_worker.py:285] worker requested memory: 71.27GiB +(Worker pid=445) DEBUG 04-22 19:59:32 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.18GiB, total_memory=79.19GiB, cuda_memory=2.01GiB, torch_memory=0.02GiB, non_torch_memory=1.99GiB, timestamp=1776887972.7254164, auto_measure=True +(Worker pid=445) DEBUG 04-22 19:59:32 [v1/worker/gpu_worker.py:285] worker requested memory: 71.27GiB +(Worker pid=444) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=444) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=444) DEBUG 04-22 19:59:32 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(Worker pid=444) DEBUG 04-22 19:59:32 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=445) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=445) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=445) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=444) DEBUG 04-22 19:59:32 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker pid=444) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker_TP0 pid=444) INFO 04-22 19:59:32 [v1/worker/gpu_model_runner.py:4735] Starting to load model openai/gpt-oss-20b... +(Worker pid=445) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=445) DEBUG 04-22 19:59:32 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [model_executor/.../quantization/mxfp4.py:75] MXFP4 linear layer is not implemented - falling back to UnquantizedLinearMethod. 
+(Worker_TP1 pid=445) DEBUG 04-22 19:59:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=64, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=True, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASHINFER: [attention sinks not supported], FLEX_ATTENTION: [attention sinks not supported]}. +(Worker_TP0 pid=444) INFO 04-22 19:59:33 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'TRITON_ATTN']. +(Worker_TP0 pid=444) INFO 04-22 19:59:33 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [model_executor/.../quantization/mxfp4.py:84] MXFP4 attention layer is not implemented. Skipping quantization for this layer. +(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [model_executor/.../oracle/mxfp4.py:355] Mxfp4 MoE backend 'FLASHINFER_TRTLLM_MXFP4_BF16' does not support the deployment configuration since kernel does not support current device cuda. +(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [model_executor/.../oracle/mxfp4.py:355] Mxfp4 MoE backend 'CK' does not support the deployment configuration since kernel does not support current device cuda. +(Worker_TP0 pid=444) INFO 04-22 19:59:33 [model_executor/.../oracle/mxfp4.py:352] Using 'TRITON' Mxfp4 MoE backend. 
+(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts +(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 49, 'fused_moe': 24, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP1 pid=445) DEBUG 04-22 19:59:33 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP1 pid=445) DEBUG 04-22 19:59:33 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP1 pid=445) DEBUG 04-22 19:59:33 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 49, 'fused_moe': 24, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=445) DEBUG 04-22 19:59:33 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00000-of-00002.safetensors', 'model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors']] +(Worker_TP1 pid=445) DEBUG 04-22 19:59:33 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors', 'model-00000-of-00002.safetensors']] +(Worker_TP0 pid=444) Loading safetensors checkpoint shards: 0% Completed | 0/3 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=245) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=245) DEBUG 04-22 20:00:11 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. +(EngineCore pid=245) INFO 04-22 20:00:11 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP1 pid=445) DEBUG 04-22 20:00:11 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=444) DEBUG 04-22 20:00:11 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 20:00:11 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=245) DEBUG 04-22 20:00:11 [v1/engine/core.py:1158] EngineCore waiting for work. +(EngineCore pid=245) DEBUG 04-22 20:00:11 [v1/engine/core.py:1158] EngineCore waiting for work. 
+(APIServer pid=1) INFO 04-22 20:00:11 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] +(APIServer pid=1) WARNING 04-22 20:00:12 [entrypoints/.../responses/serving.py:233] For gpt-oss, we ignore --enable-auto-tool-choice and always enable tool use. +(Worker_TP0 pid=444) DEBUG 04-22 20:00:12 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=445) DEBUG 04-22 20:00:12 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) DEBUG 04-22 20:00:12 [renderers/base.py:197] Warming up chat template processing... +(APIServer pid=1) DEBUG 04-22 20:00:12 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e928cc-3cd6682951500df46a152659;ecb8632e-5d0c-43fb-9ac4-d6b1de14410e) +(APIServer pid=1) DEBUG 04-22 20:00:12 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 20:00:12 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/openai/gpt-oss-20b/resolve/main/processor_config.json. +(APIServer pid=1) DEBUG 04-22 20:00:13 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e928cc-1073a5747e1f81b31bf20d64;e3968c71-54c5-4bd5-b2a7-9b55999cfe2a) +(APIServer pid=1) DEBUG 04-22 20:00:13 [transformers_utils/repo_utils.py:243] +(APIServer pid=1) DEBUG 04-22 20:00:13 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/openai/gpt-oss-20b/resolve/main/preprocessor_config.json. +(APIServer pid=1) INFO 04-22 20:00:14 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. 
+(APIServer pid=1) DEBUG 04-22 20:00:14 [renderers/base.py:203] Chat template warmup completed in 1.181s +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:37] Available routes are: +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /load, Methods: GET +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /version, Methods: GET +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /health, Methods: GET +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST +(APIServer pid=1) INFO 04-22 20:00:14 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST +(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST +(APIServer pid=1) INFO: Started server process [1] +(APIServer pid=1) INFO: Waiting for application startup. +(APIServer pid=1) INFO: Application startup complete. +(APIServer pid=1) DEBUG 04-22 20:00:17 [v1/engine/async_llm.py:875] Called check_health. +(APIServer pid=1) INFO: 10.128.10.2:38542 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/qwen-qwen1-5-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/qwen-qwen1-5-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.log new file mode 100644 index 00000000..829c48eb --- /dev/null +++ b/accuracy/results/v0.19.0/logs/qwen-qwen1-5-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.log @@ -0,0 +1,771 @@ +DEBUG 04-23 00:59:37 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
+DEBUG 04-23 00:59:37 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-23 00:59:37 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-23 00:59:37 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-23 00:59:37 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-23 00:59:37 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-23 00:59:37 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-23 00:59:37 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-23 00:59:37 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-23 00:59:37 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-23 00:59:37 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-23 00:59:37 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-23 00:59:42 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-23 00:59:44 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-23 00:59:44 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-23 00:59:44 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-23 00:59:44 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-23 00:59:44 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-23 00:59:44 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-23 00:59:44 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-23 00:59:44 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-23 00:59:44 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen1.5-MoE-A2.7B +(APIServer pid=1) INFO 04-23 00:59:44 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-23 00:59:44 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-23 00:59:44 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen1.5-MoE-A2.7B', 'model': 'Qwen/Qwen1.5-MoE-A2.7B', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-23 00:59:44 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-23 00:59:45 [model_executor/models/registry.py:774] Cached model info file for class vllm.model_executor.models.qwen2_moe.Qwen2MoeForCausalLM not found +(APIServer pid=1) DEBUG 04-23 00:59:45 [model_executor/models/registry.py:834] Cache model info for class vllm.model_executor.models.qwen2_moe.Qwen2MoeForCausalLM miss. Loading model instead. +(APIServer pid=1) DEBUG 04-23 00:59:54 [model_executor/models/registry.py:844] Loaded model info for class vllm.model_executor.models.qwen2_moe.Qwen2MoeForCausalLM +(APIServer pid=1) DEBUG 04-23 00:59:55 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 9.9489278 secs +(APIServer pid=1) INFO 04-23 00:59:55 [config/model.py:549] Resolved architecture: Qwen2MoeForCausalLM +(APIServer pid=1) INFO 04-23 00:59:55 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-23 00:59:55 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-23 00:59:55 [config/model.py:1801] Generative models support prefix caching. 
+(APIServer pid=1) DEBUG 04-23 00:59:55 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-23 00:59:55 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-23 00:59:55 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-23 00:59:55 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-23 00:59:55 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-23 00:59:55 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-23 00:59:55 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-23 00:59:57 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-23 00:59:57 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-23 01:00:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-23 01:00:01 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-23 01:00:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-23 01:00:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-23 01:00:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-23 01:00:01 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-23 01:00:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-23 01:00:01 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-23 01:00:01 [platforms/__init__.py:165] Checking if CPU platform is available. 
+DEBUG 04-23 01:00:01 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-23 01:00:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-23 01:00:01 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-23 01:00:06 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(APIServer pid=1) DEBUG 04-23 01:00:07 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. +(EngineCore pid=436) DEBUG 04-23 01:00:07 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-23 01:00:07 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=436) DEBUG 04-23 01:00:07 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/21edced6-b313-45ec-92ff-bf9809b8e7de'], outputs=['ipc:///tmp/db714d47-5168-4d47-9fd1-3221cefdb843'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=436) DEBUG 04-23 01:00:07 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=436) DEBUG 04-23 01:00:07 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=436) DEBUG 04-23 01:00:07 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=436) DEBUG 04-23 01:00:07 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=436) DEBUG 04-23 01:00:07 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=436) INFO 04-23 01:00:07 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen1.5-MoE-A2.7B', speculative_config=None, tokenizer='Qwen/Qwen1.5-MoE-A2.7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen1.5-MoE-A2.7B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=436) DEBUG 04-23 01:00:08 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.130.6.25:50059 backend=nccl +(EngineCore pid=436) INFO 04-23 01:00:08 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.130.6.25:50059 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=436) DEBUG 04-23 01:00:08 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=436) INFO 04-23 01:00:08 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A +(EngineCore pid=436) DEBUG 04-23 01:00:08 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776906008.6471531, auto_measure=True +(EngineCore pid=436) DEBUG 04-23 01:00:08 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=436) DEBUG 04-23 01:00:08 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=436) DEBUG 04-23 01:00:08 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=436) DEBUG 04-23 01:00:08 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=436) DEBUG 04-23 01:00:08 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=436) DEBUG 04-23 01:00:08 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=436) DEBUG 04-23 01:00:08 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=436) DEBUG 04-23 01:00:08 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=436) DEBUG 04-23 01:00:08 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=436) INFO 04-23 01:00:08 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen1.5-MoE-A2.7B... +(EngineCore pid=436) DEBUG 04-23 01:00:09 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=436) INFO 04-23 01:00:09 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. +(EngineCore pid=436) INFO 04-23 01:00:09 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=436) INFO 04-23 01:00:09 [model_executor/.../oracle/unquantized.py:186] Using TRITON backend for Unquantized MoE +(EngineCore pid=436) DEBUG 04-23 01:00:09 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts +(EngineCore pid=436) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=436) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
+(EngineCore pid=436) DEBUG 04-23 01:00:09 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=436) DEBUG 04-23 01:00:09 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=436) DEBUG 04-23 01:00:09 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 49, 'silu_and_mul': 24, 'fused_moe': 24, 'unquantized_fused_moe': 24, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=436) DEBUG 04-23 01:00:09 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(EngineCore pid=436) DEBUG 04-23 01:00:09 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00005-of-00008.safetensors', 'model-00008-of-00008.safetensors', 'model-00003-of-00008.safetensors', 'model-00007-of-00008.safetensors', 'model-00002-of-00008.safetensors', 'model-00001-of-00008.safetensors', 'model-00006-of-00008.safetensors', 'model-00004-of-00008.safetensors']] +(APIServer pid=1) DEBUG 04-23 01:00:17 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 01:00:27 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 01:00:37 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 01:00:47 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 01:00:57 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 01:01:07 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(APIServer pid=1) DEBUG 04-23 01:01:17 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=436) INFO 04-23 01:01:22 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for Qwen/Qwen1.5-MoE-A2.7B: 72.140959 seconds +(EngineCore pid=436) Loading safetensors checkpoint shards: 0% Completed | 0/8 [00:00 +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2_moe.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=436) INFO 04-23 01:01:51 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/bf7cc57660/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=155e491f75 comp=e546579c48 code=18cfc43ea7c46403958dd7b7827107c640593b29e9880eadef1653adb7ec84a6 dir=/data/.cache/vllm/torch_compile_cache/bf7cc57660/rank_0_0/backbone +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=436) DEBUG 04-23 
01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 
'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 
'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=436) DEBUG 04-23 
01:01:51 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=436) DEBUG 04-23 
01:01:51 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 
[compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=436) DEBUG 04-23 01:01:51 
[compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore 
pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 
'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] Vllm config hash: 155e491f75 +(EngineCore pid=436) INFO 04-23 01:01:51 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.42 s +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=436) DEBUG 04-23 01:01:52 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=436) DEBUG 04-23 01:01:52 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=436) DEBUG 04-23 01:01:52 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms +(EngineCore pid=436) DEBUG 04-23 01:01:52 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=436) DEBUG 04-23 01:01:52 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=436) INFO 04-23 01:01:53 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=436) DEBUG 04-23 01:01:53 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/bf7cc57660/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=436) DEBUG 04-23 01:01:53 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=436) DEBUG 04-23 01:01:53 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=436) DEBUG 04-23 01:01:53 
[compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=436) DEBUG 04-23 01:01:53 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=436) DEBUG 04-23 01:01:53 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=436) DEBUG 04-23 01:01:54 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/bf7cc57660/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=436) DEBUG 04-23 01:01:55 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=436) DEBUG 04-23 01:01:55 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(EngineCore pid=436) DEBUG 04-23 01:01:55 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=436) DEBUG 04-23 01:01:55 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=436) DEBUG 04-23 01:01:55 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(EngineCore pid=436) DEBUG 04-23 01:01:56 [compilation/backends.py:377] Store the 24-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_24', '/data/.cache/vllm/torch_compile_cache/bf7cc57660/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_24') +(EngineCore pid=436) INFO 04-23 01:01:56 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 4.27 s +(EngineCore pid=436) DEBUG 04-23 01:01:56 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/bf7cc57660/rank_0_0/backbone/computation_graph.py +(EngineCore pid=436) INFO 04-23 
01:01:57 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/fe26b561311617008fabbb42b46538f4447d96637ecd092d3fde6e2428b1dd0a/rank_0_0/model +(EngineCore pid=436) INFO 04-23 01:01:57 [compilation/monitor.py:48] torch.compile took 8.98 s in total +(APIServer pid=1) DEBUG 04-23 01:01:57 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=436) WARNING 04-23 01:01:57 [model_executor/.../fused_moe/fused_moe.py:1090] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=NVIDIA_H100_80GB_HBM3.json +(EngineCore pid=436) INFO 04-23 01:01:58 [compilation/monitor.py:76] Initial profiling/warmup run took 1.65 s +(EngineCore pid=436) INFO 04-23 01:02:04 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=436) DEBUG 04-23 01:02:04 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=436) INFO 04-23 01:02:04 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=436) DEBUG 04-23 01:02:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=436) DEBUG 04-23 01:02:05 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, 
ubatch_slices_padded: None +(EngineCore pid=436) DEBUG 04-23 01:02:05 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 96.00 MiB first-capture + (51-1) × 8.00 MiB per-graph +(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=436) DEBUG 04-23 01:02:05 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=436) DEBUG 04-23 01:02:05 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 136.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=436) INFO 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.82 GiB total +(EngineCore pid=436) DEBUG 04-23 01:02:06 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=436) DEBUG 04-23 01:02:06 [v1/worker/gpu_worker.py:430] Free memory after profiling: 
50.86 GiB (total), 47.41 GiB (within requested) +(EngineCore pid=436) DEBUG 04-23 01:02:06 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.98 seconds. Total non KV cache memory: 29.4GiB; torch peak memory increase: 2.47GiB; non-torch forward increase memory: 0.25GiB; weights memory: 26.67GiB. +(EngineCore pid=436) INFO 04-23 01:02:06 [v1/worker/gpu_worker.py:436] Available KV cache memory: 45.83 GiB +(EngineCore pid=436) INFO 04-23 01:02:06 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9603 to maintain the same effective KV cache size. +(EngineCore pid=436) INFO 04-23 01:02:06 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 250,304 tokens +(EngineCore pid=436) INFO 04-23 01:02:06 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 30.55x +(EngineCore pid=436) 2026-04-23 01:02:06,159 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=436) DEBUG 04-23 01:02:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=436) 2026-04-23 01:02:06,186 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=436) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:12:17 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:12:17 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:12:17 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:12:17 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:12:17 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:12:17 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 +(APIServer pid=1) INFO 04-22 00:12:17 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:12:17 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:12:17 [entrypoints/utils.py:233] non-default args: {'model_tag': 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8', 'model': 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:12:17 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:12:18 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.mistral3.Mistral3ForConditionalGeneration from cache +(APIServer pid=1) DEBUG 04-22 00:12:18 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0016757 secs +(APIServer pid=1) INFO 04-22 00:12:18 [config/model.py:549] Resolved architecture: Mistral3ForConditionalGeneration +(APIServer pid=1) INFO 04-22 00:12:18 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:12:18 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:12:18 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:12:18 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:12:18 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 00:12:18 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 00:12:18 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:12:18 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:12:18 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:12:18 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. +(APIServer pid=1) DEBUG 04-22 00:12:19 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:12:19 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. +(APIServer pid=1) DEBUG 04-22 00:12:19 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +(APIServer pid=1) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. 
+(APIServer pid=1) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. +DEBUG 04-22 00:12:27 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:12:27 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:12:27 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:12:27 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:12:27 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:12:27 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:12:27 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:12:27 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:12:27 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:12:27 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:12:27 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:12:27 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:12:32 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=468) DEBUG 04-22 00:12:34 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:12:34 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
+(EngineCore pid=468) DEBUG 04-22 00:12:34 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/ecc5eda6-a988-401c-90b3-adab9ce9c938'], outputs=['ipc:///tmp/89b6769d-3511-4956-9b04-41e6ab5c2ec1'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=468) DEBUG 04-22 00:12:34 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=468) DEBUG 04-22 00:12:34 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=468) DEBUG 04-22 00:12:34 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=468) DEBUG 04-22 00:12:34 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=468) DEBUG 04-22 00:12:34 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=468) INFO 04-22 00:12:34 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8', speculative_config=None, tokenizer='RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 
'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=468) DEBUG 04-22 00:12:34 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(EngineCore pid=468) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. 
+(EngineCore pid=468) DEBUG 04-22 00:12:35 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.189:52183 backend=nccl +(EngineCore pid=468) INFO 04-22 00:12:35 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.189:52183 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=468) DEBUG 04-22 00:12:35 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=468) INFO 04-22 00:12:35 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=468) DEBUG 04-22 00:12:36 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776816756.0873954, auto_measure=True +(EngineCore pid=468) DEBUG 04-22 00:12:36 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=468) DEBUG 04-22 00:12:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=468) DEBUG 04-22 00:12:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] 
+(EngineCore pid=468) DEBUG 04-22 00:12:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=468) DEBUG 04-22 00:12:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=468) DEBUG 04-22 00:12:36 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(EngineCore pid=468) DEBUG 04-22 00:12:36 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=468) DEBUG 04-22 00:12:36 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. +(EngineCore pid=468) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. +(EngineCore pid=468) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. +(EngineCore pid=468) DEBUG 04-22 00:12:40 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=468) INFO 04-22 00:12:40 [v1/worker/gpu_model_runner.py:4735] Starting to load model RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8... 
+(EngineCore pid=468) INFO 04-22 00:12:41 [config/vllm.py:790] Asynchronous scheduling is enabled. +(EngineCore pid=468) DEBUG 04-22 00:12:41 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds', 't_cond'] +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.self_attn.qkv_proj +(EngineCore pid=468) INFO 04-22 00:12:41 [model_executor/.../linear/__init__.py:291] Selected CutlassInt8ScaledMMLinearKernel for CompressedTensorsW8A8Int8 +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=468) INFO 04-22 00:12:41 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(EngineCore pid=468) INFO 04-22 00:12:41 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 
00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.self_attn.qkv_proj +(EngineCore pid=468) 
DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.mlp.gate_up_proj +(EngineCore 
pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.self_attn.qkv_proj 
+(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
language_model.model.layers.22.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for language_model.model.layers.25.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] 
Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.self_attn.o_proj +(EngineCore pid=468) DEBUG 
04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.mlp.down_proj +(EngineCore pid=468) 
DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.self_attn.o_proj 
+(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.mlp.down_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.self_attn.qkv_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.self_attn.o_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.mlp.gate_up_proj +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
language_model.model.layers.39.mlp.down_proj +(EngineCore pid=468) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=468) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=468) DEBUG 04-22 00:12:41 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=468) DEBUG 04-22 00:12:41 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=468) DEBUG 04-22 00:12:41 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=468) DEBUG 04-22 00:12:41 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 131, 'silu_and_mul': 40, 'conv2d': 1, 'gelu_and_mul': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=468) DEBUG 04-22 00:12:41 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=468) DEBUG 04-22 00:12:41 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 131, 'silu_and_mul': 40, 'conv2d': 1, 'gelu_and_mul': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00003-of-00006.safetensors', 'model-00005-of-00006.safetensors', 'model-00002-of-00006.safetensors', 'model-00006-of-00006.safetensors', 'model-00004-of-00006.safetensors', 'model-00001-of-00006.safetensors']] +(EngineCore pid=468) Loading safetensors checkpoint shards: 0% Completed | 0/6 [00:00 +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/mistral.py +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=468) INFO 04-22 00:13:11 [compilation/backends.py:1051] Using cache directory: 
/data/.cache/vllm/torch_compile_cache/327f08bb6e/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=a079653aeb comp=e546579c48 code=c676488f783f7fceec1e2b8e4d429af70dfba617895ac42a7314b1ea025a61d6 dir=/data/.cache/vllm/torch_compile_cache/327f08bb6e/rank_0_0/backbone +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 
'auto', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, 
+(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=468) DEBUG 04-22 00:13:11 
[compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=468) DEBUG 04-22 00:13:11 
[compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=468) DEBUG 04-22 00:13:11 
[compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
+(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} +(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] Vllm config hash: a079653aeb +(EngineCore pid=468) INFO 04-22 00:13:11 [compilation/backends.py:1111] Dynamo bytecode transform time: 6.84 s +(EngineCore pid=468) DEBUG 04-22 00:13:12 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=468) DEBUG 04-22 00:13:12 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=468) DEBUG 04-22 00:13:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=468) DEBUG 04-22 00:13:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms +(EngineCore pid=468) DEBUG 04-22 00:13:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=468) DEBUG 04-22 00:13:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=468) DEBUG 04-22 00:13:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(APIServer pid=1) DEBUG 04-22 00:13:14 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=468) INFO 04-22 00:13:15 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=468) DEBUG 04-22 00:13:15 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/327f08bb6e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=468) DEBUG 04-22 00:13:15 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=468) DEBUG 04-22 00:13:15 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=468) DEBUG 04-22 00:13:15 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.5 ms +(EngineCore pid=468) DEBUG 04-22 00:13:15 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=468) DEBUG 04-22 00:13:15 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=468) DEBUG 04-22 00:13:17 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/327f08bb6e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=468) DEBUG 04-22 00:13:17 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/327f08bb6e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_2') +(EngineCore pid=468) DEBUG 04-22 00:13:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=468) DEBUG 04-22 00:13:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms +(EngineCore 
pid=468) DEBUG 04-22 00:13:19 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.2 ms +(EngineCore pid=468) DEBUG 04-22 00:13:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=468) DEBUG 04-22 00:13:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(EngineCore pid=468) DEBUG 04-22 00:13:19 [compilation/backends.py:377] Store the 40-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_40', '/data/.cache/vllm/torch_compile_cache/327f08bb6e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_40') +(EngineCore pid=468) INFO 04-22 00:13:19 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 7.45 s +(EngineCore pid=468) DEBUG 04-22 00:13:19 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/327f08bb6e/rank_0_0/backbone/computation_graph.py +(EngineCore pid=468) INFO 04-22 00:13:21 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/0fc0a88d52a7ba91dbb88a644ae64adcbc9dfc7015c5ff89047a59dc90fdd3ce/rank_0_0/model +(EngineCore pid=468) INFO 04-22 00:13:21 [compilation/monitor.py:48] torch.compile took 16.46 s in total +(EngineCore pid=468) INFO 04-22 00:13:22 [compilation/monitor.py:76] Initial profiling/warmup run took 0.77 s +(APIServer pid=1) DEBUG 04-22 00:13:24 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=468) INFO 04-22 00:13:27 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=468) DEBUG 04-22 00:13:27 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=468) INFO 04-22 00:13:27 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=468) DEBUG 04-22 00:13:28 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=468) DEBUG 04-22 00:13:28 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 198.00 MiB first-capture + (51-1) × 8.00 MiB per-graph +(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=468) DEBUG 04-22 00:13:28 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
+(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=468) DEBUG 04-22 00:13:28 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 8.00 MiB per-graph +(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=468) INFO 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.04 GiB total +(EngineCore pid=468) DEBUG 04-22 00:13:29 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=468) DEBUG 04-22 00:13:29 [v1/worker/gpu_worker.py:430] Free memory after profiling: 54.11 GiB (total), 50.66 GiB (within requested) +(EngineCore pid=468) DEBUG 04-22 00:13:29 [v1/worker/gpu_worker.py:435] Memory profiling takes 24.83 seconds. Total non KV cache memory: 26.5GiB; torch peak memory increase: 2.18GiB; non-torch forward increase memory: 0.26GiB; weights memory: 24.07GiB. +(EngineCore pid=468) INFO 04-22 00:13:29 [v1/worker/gpu_worker.py:436] Available KV cache memory: 48.73 GiB +(EngineCore pid=468) INFO 04-22 00:13:29 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9631 to maintain the same effective KV cache size. 
+(EngineCore pid=468) INFO 04-22 00:13:29 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 319,344 tokens +(EngineCore pid=468) INFO 04-22 00:13:29 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 38.98x +(EngineCore pid=468) 2026-04-22 00:13:29,131 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=468) DEBUG 04-22 00:13:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=468) 2026-04-22 00:13:29,143 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=468) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:14:02 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:14:02 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:14:02 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:14:02 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:14:02 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:14:02 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 +(APIServer pid=1) INFO 04-22 00:14:02 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:14:02 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:14:02 [entrypoints/utils.py:233] non-default args: {'model_tag': 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8', 'model': 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:14:02 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:14:03 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.mistral3.Mistral3ForConditionalGeneration from cache +(APIServer pid=1) DEBUG 04-22 00:14:03 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0014510 secs +(APIServer pid=1) INFO 04-22 00:14:03 [config/model.py:549] Resolved architecture: Mistral3ForConditionalGeneration +(APIServer pid=1) INFO 04-22 00:14:03 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:14:03 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:14:03 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:14:03 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:14:03 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 00:14:03 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) DEBUG 04-22 00:14:03 [config/parallel.py:743] Defaulting to use mp for distributed inference +(APIServer pid=1) INFO 04-22 00:14:03 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:14:03 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(APIServer pid=1) INFO 04-22 00:14:04 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(APIServer pid=1) DEBUG 04-22 00:14:04 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:14:04 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. +(APIServer pid=1) DEBUG 04-22 00:14:05 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:14:05 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. 
+(APIServer pid=1) DEBUG 04-22 00:14:05 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +(APIServer pid=1) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. +(APIServer pid=1) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. +DEBUG 04-22 00:14:14 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:14:14 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:14:14 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:14:14 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:14:14 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:14:14 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:14:14 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:14:14 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:14:14 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:14:14 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:14:14 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
+DEBUG 04-22 00:14:14 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:14:19 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(APIServer pid=1) DEBUG 04-22 00:14:20 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. +(EngineCore pid=469) DEBUG 04-22 00:14:20 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:14:20 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=469) DEBUG 04-22 00:14:20 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/d25f1ed3-6fc0-4d15-8be0-03802292156b'], outputs=['ipc:///tmp/87c7c8cd-7976-4324-bead-27df712c1084'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=469) DEBUG 04-22 00:14:20 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=469) DEBUG 04-22 00:14:20 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=469) DEBUG 04-22 00:14:20 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=469) DEBUG 04-22 00:14:20 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=469) DEBUG 04-22 00:14:20 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=469) INFO 04-22 00:14:20 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8', speculative_config=None, tokenizer='RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 
'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [6553, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=469) WARNING 04-22 00:14:20 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
+(EngineCore pid=469) INFO 04-22 00:14:20 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.129.9.25 (local), world_size=2, local_world_size=2 +(EngineCore pid=469) DEBUG 04-22 00:14:20 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/6c93d54c-bb41-42ff-b22e-c8895b3ada8f +(EngineCore pid=469) DEBUG 04-22 00:14:20 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_873c0090'), local_subscribe_addr='ipc:///tmp/6c93d54c-bb41-42ff-b22e-c8895b3ada8f', local_notify_addr='ipc:///tmp/089c5772-3312-4148-b8a6-d9a896842e06', remote_subscribe_addr=None, remote_addr_ipv6=False) +DEBUG 04-22 00:14:24 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:14:24 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:14:24 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:14:24 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:14:24 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:14:24 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:14:24 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:14:24 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:14:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:14:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:14:24 [platforms/__init__.py:113] Checking if ROCm platform is available. 
+DEBUG 04-22 00:14:24 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:14:24 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:14:24 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:14:24 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:14:24 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:14:24 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:14:24 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:14:24 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:14:24 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:14:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:14:24 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:14:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:14:24 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:14:29 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:14:29 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
+DEBUG 04-22 00:14:30 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:14:30 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:14:30 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:14:30 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +DEBUG 04-22 00:14:30 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:14:30 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:14:30 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:14:30 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. +(APIServer pid=1) DEBUG 04-22 00:14:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +DEBUG 04-22 00:14:31 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +DEBUG 04-22 00:14:31 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. 
+The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. +(Worker pid=668) DEBUG 04-22 00:14:32 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:56531 backend=nccl +(Worker pid=668) INFO 04-22 00:14:32 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:56531 backend=nccl +(Worker pid=669) DEBUG 04-22 00:14:32 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:56531 backend=nccl +(Worker pid=669) INFO 04-22 00:14:32 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:56531 backend=nccl +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=669) DEBUG 04-22 00:14:32 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +(Worker pid=668) DEBUG 04-22 00:14:32 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 +(Worker pid=668) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
+(Worker pid=668) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=668) DEBUG 04-22 00:14:33 [utils/nccl.py:34] Found nccl from library libnccl.so.2 +(Worker pid=668) INFO 04-22 00:14:33 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 +(Worker pid=669) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(Worker pid=669) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(Worker pid=669) DEBUG 04-22 00:14:34 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=668) DEBUG 04-22 00:14:34 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. +(Worker pid=668) DEBUG 04-22 00:14:34 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/c98dad95-8292-4c41-9cae-565225952c2b +(Worker pid=668) DEBUG 04-22 00:14:34 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_df50e73a'), local_subscribe_addr='ipc:///tmp/c98dad95-8292-4c41-9cae-565225952c2b', local_notify_addr='ipc:///tmp/c21e2440-2fd2-47d3-a452-a26a1e6d2c8c', remote_subscribe_addr=None, remote_addr_ipv6=False) +(Worker pid=669) DEBUG 04-22 00:14:34 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/c98dad95-8292-4c41-9cae-565225952c2b +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(Worker pid=668) INFO 04-22 00:14:34 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(Worker pid=669) DEBUG 04-22 00:14:34 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776816874.35419, auto_measure=True +(Worker pid=669) DEBUG 04-22 00:14:34 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=668) DEBUG 04-22 00:14:34 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776816874.3785136, auto_measure=True +(Worker pid=668) DEBUG 04-22 00:14:34 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(Worker pid=669) DEBUG 04-22 00:14:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=669) DEBUG 04-22 00:14:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=669) DEBUG 04-22 00:14:34 [compilation/decorators.py:213] 
Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=668) DEBUG 04-22 00:14:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=668) DEBUG 04-22 00:14:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(Worker pid=668) DEBUG 04-22 00:14:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(Worker pid=669) DEBUG 04-22 00:14:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=669) DEBUG 04-22 00:14:34 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=668) DEBUG 04-22 00:14:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(Worker pid=668) DEBUG 04-22 00:14:34 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. +(Worker pid=668) DEBUG 04-22 00:14:34 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(Worker pid=669) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. 
+(Worker pid=668) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. +(Worker pid=669) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. +(Worker pid=668) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. +(Worker pid=668) DEBUG 04-22 00:14:39 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(Worker_TP0 pid=668) INFO 04-22 00:14:39 [v1/worker/gpu_model_runner.py:4735] Starting to load model RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8... 
+(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds', 't_cond'] +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
language_model.model.layers.2.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
language_model.model.layers.4.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
language_model.model.layers.7.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
language_model.model.layers.9.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 
for language_model.model.layers.12.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for language_model.model.layers.14.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.self_attn.o_proj +(Worker_TP0 pid=668) INFO 04-22 00:14:39 [config/vllm.py:790] Asynchronous scheduling is enabled. 
+(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.self_attn.o_proj +(Worker_TP0 pid=668) INFO 04-22 00:14:39 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds', 't_cond'] +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.self_attn.qkv_proj +(Worker_TP0 pid=668) INFO 04-22 00:14:39 [model_executor/.../linear/__init__.py:291] Selected CutlassInt8ScaledMMLinearKernel for CompressedTensorsW8A8Int8 +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 
00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(Worker_TP0 pid=668) INFO 04-22 00:14:39 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(Worker_TP0 pid=668) INFO 04-22 00:14:39 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 
00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 
04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.self_attn.qkv_proj +(Worker_TP1 pid=669) 
DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.self_attn.qkv_proj +(Worker_TP1 
pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.self_attn.o_proj 
+(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
language_model.model.layers.10.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for language_model.model.layers.37.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] 
Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.self_attn.qkv_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 
00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 
04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.self_attn.o_proj +(Worker_TP0 
pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.self_attn.o_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.mlp.gate_up_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [compilation/backends.py:101] Using 
InductorStandaloneAdaptor +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 
[config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 131, 'silu_and_mul': 40, 'conv2d': 1, 'gelu_and_mul': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for language_model.model.layers.28.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.mlp.down_proj +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 131, 'silu_and_mul': 40, 'conv2d': 1, 'gelu_and_mul': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
language_model.model.layers.31.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for language_model.model.layers.33.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.self_attn.qkv_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.self_attn.o_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.mlp.gate_up_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.mlp.down_proj +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 131, 'silu_and_mul': 40, 'conv2d': 1, 'gelu_and_mul': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [config/compilation.py:1194] enabled custom ops: Counter() +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 131, 'silu_and_mul': 40, 'conv2d': 1, 'gelu_and_mul': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(Worker_TP0 pid=668) DEBUG 04-22 
00:14:39 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... +(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00006.safetensors', 'model-00001-of-00006.safetensors', 'model-00003-of-00006.safetensors', 'model-00005-of-00006.safetensors', 'model-00006-of-00006.safetensors', 'model-00004-of-00006.safetensors']] +(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00006.safetensors', 'model-00003-of-00006.safetensors', 'model-00001-of-00006.safetensors', 'model-00002-of-00006.safetensors', 'model-00006-of-00006.safetensors', 'model-00005-of-00006.safetensors']] +(Worker_TP0 pid=668) Loading safetensors checkpoint shards: 0% Completed | 0/6 [00:00 +(Worker_TP1 pid=669) DEBUG 04-22 00:14:59 [compilation/decorators.py:528] Start compiling function +(EngineCore pid=469) DEBUG 04-22 00:15:00 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) DEBUG 04-22 00:15:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP1 pid=669) DEBUG 04-22 
00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/mistral.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=041fbb8292 comp=e546579c48 code=fdff84f9a2f10e49a795b178e6bd05497bf710b53e4ed33c50d0ecc2ca179fc3 dir=/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_1_0/backbone +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 
[compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 
'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 
'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': 
False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP1 
pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP1 pid=669) 
DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP1 pid=669) DEBUG 04-22 
00:15:06 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP1 pid=669) 
DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 
'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 
[compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, 
+(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] Vllm config hash: 041fbb8292 +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(Worker_TP0 pid=668) 
DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/mistral.py +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(Worker_TP0 pid=668) INFO 04-22 00:15:06 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone for vLLM's torch.compile +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=041fbb8292 comp=e546579c48 code=fdff84f9a2f10e49a795b178e6bd05497bf710b53e4ed33c50d0ecc2ca179fc3 dir=/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] Compile env factors (raw): +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, 
+(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'CUDA_HOME': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VERBOSE': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, 
+(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(Worker_TP0 pid=668) DEBUG 
04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 
[compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 
[compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 
[compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 
[compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(Worker_TP0 pid=668) DEBUG 04-22 
00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 
512, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(Worker_TP0 
pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 
'VLLM_USE_STANDALONE_COMPILE': True, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] Vllm config hash: 041fbb8292 +(Worker_TP0 pid=668) INFO 04-22 00:15:06 [compilation/backends.py:1111] Dynamo bytecode transform time: 6.95 s +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 6553 +(Worker_TP0 pid=668) INFO 04-22 00:15:06 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm +(Worker_TP0 pid=668) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. 
You can specify `device_id` in `init_process_group` to mute this warning. +(Worker_TP0 pid=668) return func(*args, **kwargs) +(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=6553, hidden_dim=5120, dtype=torch.bfloat16 +(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=6553, hidden_dim=5120, dtype=torch.bfloat16 +(Worker_TP0 pid=668) INFO 04-22 00:15:06 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm +(Worker_TP0 pid=668) DEBUG 04-22 00:15:07 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 6553), (6554, 8192)] +(Worker_TP0 pid=668) DEBUG 04-22 00:15:07 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(Worker_TP0 pid=668) DEBUG 04-22 00:15:08 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=668) DEBUG 04-22 00:15:08 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:08 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 0 patterns +(Worker_TP0 pid=668) DEBUG 04-22 00:15:08 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 1.0 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:08 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:08 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=668) DEBUG 04-22 00:15:08 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP1 pid=669) DEBUG 04-22 
00:15:08 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=669) DEBUG 04-22 00:15:08 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP1 pid=669) DEBUG 04-22 00:15:08 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 0 patterns +(Worker_TP1 pid=669) DEBUG 04-22 00:15:08 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 1.1 ms +(Worker_TP1 pid=669) DEBUG 04-22 00:15:08 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=669) DEBUG 04-22 00:15:08 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=669) DEBUG 04-22 00:15:08 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=668) INFO 04-22 00:15:10 [compilation/backends.py:372] Cache the graph of compile range (1, 6553) for later use +(Worker_TP0 pid=668) DEBUG 04-22 00:15:10 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 6553) from inductor_standalone via handle ('artifact_compile_range_1_6553_subgraph_0', '/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/artifact_compile_range_1_6553_subgraph_0') +(APIServer pid=1) DEBUG 04-22 00:15:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP1 pid=669) DEBUG 04-22 00:15:10 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=669) DEBUG 04-22 00:15:10 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=669) DEBUG 04-22 00:15:10 [compilation/passes/pass_manager.py:100] Skipping with compile range (6554, 8192) +(Worker_TP1 pid=669) DEBUG 04-22 00:15:10 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=669) DEBUG 04-22 00:15:10 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=669) DEBUG 04-22 00:15:10 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:10 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=668) DEBUG 04-22 00:15:10 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:10 [compilation/passes/pass_manager.py:100] Skipping with compile range (6554, 8192) +(Worker_TP0 pid=668) DEBUG 04-22 00:15:10 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:10 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=668) DEBUG 04-22 00:15:10 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=668) INFO 04-22 00:15:11 [compilation/backends.py:372] Cache the graph of compile range (6554, 8192) for later use +(Worker_TP0 pid=668) DEBUG 04-22 00:15:11 [compilation/backends.py:377] Store the 0-th graph for compile range(6554, 8192) from inductor_standalone via handle ('artifact_compile_range_6554_8192_subgraph_0', 
'/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/artifact_compile_range_6554_8192_subgraph_0') +(Worker_TP1 pid=669) DEBUG 04-22 00:15:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=669) DEBUG 04-22 00:15:11 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.0 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=668) DEBUG 04-22 00:15:11 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms +(Worker_TP1 pid=669) DEBUG 04-22 00:15:11 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=669) DEBUG 04-22 00:15:11 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.2 ms +(Worker_TP1 pid=669) DEBUG 04-22 00:15:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP1 pid=669) DEBUG 04-22 00:15:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP1 pid=669) DEBUG 04-22 00:15:11 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:11 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=668) DEBUG 04-22 00:15:11 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.6 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes +(Worker_TP0 pid=668) DEBUG 04-22 00:15:11 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.6 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:12 [compilation/backends.py:377] 
Store the 1-th graph for compile range(1, 6553) from inductor_standalone via handle ('artifact_compile_range_1_6553_subgraph_1', '/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/artifact_compile_range_1_6553_subgraph_1') +(Worker_TP1 pid=669) DEBUG 04-22 00:15:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=669) DEBUG 04-22 00:15:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms +(Worker_TP1 pid=669) DEBUG 04-22 00:15:13 [compilation/passes/pass_manager.py:100] Skipping with compile range (6554, 8192) +(Worker_TP1 pid=669) DEBUG 04-22 00:15:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms +(Worker_TP1 pid=669) DEBUG 04-22 00:15:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=669) DEBUG 04-22 00:15:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=668) DEBUG 04-22 00:15:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:13 [compilation/passes/pass_manager.py:100] Skipping with compile range (6554, 8192) +(Worker_TP0 pid=668) DEBUG 04-22 00:15:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=668) DEBUG 04-22 00:15:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:14 [compilation/backends.py:377] Store the 1-th graph for compile range(6554, 8192) from inductor_standalone via handle 
('artifact_compile_range_6554_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/artifact_compile_range_6554_8192_subgraph_1') +(Worker_TP0 pid=668) DEBUG 04-22 00:15:15 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 6553) from inductor_standalone via handle ('artifact_compile_range_1_6553_subgraph_2', '/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/artifact_compile_range_1_6553_subgraph_2') +(Worker_TP0 pid=668) DEBUG 04-22 00:15:15 [compilation/backends.py:377] Store the 2-th graph for compile range(6554, 8192) from inductor_standalone via handle ('artifact_compile_range_6554_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/artifact_compile_range_6554_8192_subgraph_2') +(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 40.1 ms +(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms +(Worker_TP0 pid=668) DEBUG 04-22 
00:15:19 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns +(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 40.2 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes +(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/backends.py:377] Store the 40-th graph for compile range(1, 6553) from inductor_standalone via handle ('artifact_compile_range_1_6553_subgraph_40', '/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/artifact_compile_range_1_6553_subgraph_40') +(Worker_TP0 pid=668) INFO 04-22 00:15:19 [compilation/backends.py:390] Compiling a graph for compile range (1, 6553) takes 9.11 s +(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/passes/pass_manager.py:100] Skipping with compile range (6554, 8192) +(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(Worker_TP0 pid=668) DEBUG 
04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/passes/pass_manager.py:100] Skipping with compile range (6554, 8192) +(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms +(Worker_TP0 pid=668) DEBUG 04-22 00:15:20 [compilation/backends.py:377] Store the 40-th graph for compile range(6554, 8192) from inductor_standalone via handle ('artifact_compile_range_6554_8192_subgraph_40', '/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/artifact_compile_range_6554_8192_subgraph_40') +(Worker_TP0 pid=668) INFO 04-22 00:15:20 [compilation/backends.py:390] Compiling a graph for compile range (6554, 8192) takes 10.21 s +(APIServer pid=1) DEBUG 04-22 00:15:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP0 pid=668) DEBUG 04-22 00:15:20 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/computation_graph.py +(Worker_TP0 pid=668) INFO 04-22 00:15:22 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/85f3ac149fb92a20bbf4acfb1db1d6d2441b76e5d6bb81e4e5b7eb41ef3d9031/rank_0_0/model +(Worker_TP0 pid=668) INFO 04-22 00:15:22 [compilation/monitor.py:48] torch.compile took 22.93 s in total +(Worker_TP0 pid=668) INFO 04-22 00:15:23 [compilation/monitor.py:76] Initial profiling/warmup run took 0.63 s +(Worker_TP0 pid=668) INFO 04-22 00:15:28 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP0 pid=668) DEBUG 04-22 00:15:28 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP0 pid=668) INFO 04-22 00:15:28 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP1 pid=669) INFO 04-22 00:15:28 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(Worker_TP1 pid=669) DEBUG 04-22 00:15:28 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(Worker_TP1 pid=669) INFO 04-22 00:15:28 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, 
ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 100.00 MiB first-capture + (51-1) × 24.00 MiB per-graph +(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 100.00 MiB first-capture + (51-1) × 24.00 MiB per-graph +(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 
pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 136.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 136.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(Worker_TP0 pid=668) INFO 04-22 00:15:29 
[distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP1 pid=669) INFO 04-22 00:15:29 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses +(Worker_TP0 pid=668) DEBUG 04-22 00:15:30 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP0 pid=668) INFO 04-22 00:15:30 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.60 GiB total +(Worker_TP1 pid=669) DEBUG 04-22 00:15:30 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(Worker_TP1 pid=669) INFO 04-22 00:15:30 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.60 GiB total +(Worker_TP0 pid=668) DEBUG 04-22 00:15:30 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP0 pid=668) DEBUG 04-22 00:15:30 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.25 GiB (total), 59.82 GiB (within requested) +(Worker_TP0 pid=668) DEBUG 04-22 00:15:30 [v1/worker/gpu_worker.py:435] Memory profiling takes 31.49 seconds. Total non KV cache memory: 16.21GiB; torch peak memory increase: 2.03GiB; non-torch forward increase memory: 2.07GiB; weights memory: 12.11GiB. +(Worker_TP0 pid=668) INFO 04-22 00:15:30 [v1/worker/gpu_worker.py:436] Available KV cache memory: 59.02 GiB +(Worker_TP0 pid=668) INFO 04-22 00:15:30 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9702 to maintain the same effective KV cache size. 
+(Worker_TP0 pid=668) DEBUG 04-22 00:15:30 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=469) DEBUG 04-22 00:15:30 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=469) DEBUG 04-22 00:15:30 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=669) DEBUG 04-22 00:15:30 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(Worker_TP1 pid=669) DEBUG 04-22 00:15:30 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.25 GiB (total), 59.82 GiB (within requested) +(Worker_TP1 pid=669) DEBUG 04-22 00:15:30 [v1/worker/gpu_worker.py:435] Memory profiling takes 31.45 seconds. Total non KV cache memory: 16.21GiB; torch peak memory increase: 2.03GiB; non-torch forward increase memory: 2.07GiB; weights memory: 12.11GiB. +(Worker_TP1 pid=669) INFO 04-22 00:15:30 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9702 to maintain the same effective KV cache size. 
+(Worker_TP1 pid=669) DEBUG 04-22 00:15:30 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=469) DEBUG 04-22 00:15:30 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=469) INFO 04-22 00:15:30 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 773,584 tokens +(EngineCore pid=469) INFO 04-22 00:15:30 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 94.43x +(Worker_TP0 pid=668) DEBUG 04-22 00:15:30 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=669) DEBUG 04-22 00:15:30 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=668) INFO 04-22 00:15:30 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 +(Worker_TP1 pid=669) INFO 04-22 00:15:30 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 +(Worker_TP0 pid=668) DEBUG 04-22 00:15:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-22 00:15:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=668) 2026-04-22 00:15:30,826 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP1 pid=669) 2026-04-22 00:15:30,826 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(Worker_TP0 pid=668) DEBUG 04-22 00:15:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP1 pid=669) DEBUG 04-22 00:15:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(APIServer pid=1) DEBUG 04-22 00:15:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(Worker_TP0 pid=668) 2026-04-22 00:15:30,880 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP1 pid=669) 2026-04-22 00:15:30,880 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(Worker_TP1 pid=669) DEBUG 04-22 00:15:31 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(Worker_TP0 pid=668) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=469) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=469) INFO 04-22 00:15:41 [config/compilation.py:290] Enabled custom fusions: allreduce_rms +(Worker_TP1 pid=669) DEBUG 04-22 00:15:41 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP0 pid=668) DEBUG 04-22 00:15:41 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=469) DEBUG 04-22 00:15:41 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(EngineCore pid=469) DEBUG 04-22 00:15:41 [v1/engine/core.py:1158] EngineCore waiting for work. +(EngineCore pid=469) DEBUG 04-22 00:15:41 [v1/engine/core.py:1158] EngineCore waiting for work. +(APIServer pid=1) INFO 04-22 00:15:41 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] +(APIServer pid=1) DEBUG 04-22 00:15:42 [renderers/base.py:197] Warming up chat template processing... 
+(Worker_TP0 pid=668) DEBUG 04-22 00:15:42 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(Worker_TP1 pid=669) DEBUG 04-22 00:15:42 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event +(APIServer pid=1) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. +(APIServer pid=1) INFO 04-22 00:15:43 [renderers/hf.py:314] Detected the chat template content format to be 'openai'. You can set `--chat-template-content-format` to override this. +(APIServer pid=1) DEBUG 04-22 00:15:43 [renderers/base.py:203] Chat template warmup completed in 1.712s +(APIServer pid=1) DEBUG 04-22 00:15:43 [renderers/base.py:218] Warming up multi-modal processing... +(APIServer pid=1) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. 
+(APIServer pid=1) INFO 04-22 00:15:46 [renderers/base.py:231] Multi-modal warmup completed in 2.956s +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:37] Available routes are: +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: HEAD, GET +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /docs, Methods: HEAD, GET +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: HEAD, GET +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /redoc, Methods: HEAD, GET +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /load, Methods: GET +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /version, Methods: GET +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /health, Methods: GET +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /ping, Methods: GET +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /ping, Methods: POST +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST +(APIServer pid=1) INFO 04-22 00:15:46 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST +(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST +(APIServer pid=1) INFO: Started server process [1] +(APIServer pid=1) INFO: Waiting for application startup. +(APIServer pid=1) INFO: Application startup complete. +(APIServer pid=1) DEBUG 04-22 00:15:51 [v1/engine/async_llm.py:875] Called check_health. +(APIServer pid=1) INFO: 10.129.8.2:59896 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/w8a8-redhatai-qwen2-5-7b--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/w8a8-redhatai-qwen2-5-7b--h100-80gb--tp1pp1dp1--8192.log new file mode 100644 index 00000000..b8bd2a34 --- /dev/null +++ b/accuracy/results/v0.19.0/logs/w8a8-redhatai-qwen2-5-7b--h100-80gb--tp1pp1dp1--8192.log @@ -0,0 +1,861 @@ +DEBUG 04-22 00:20:54 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
+DEBUG 04-22 00:20:54 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:20:54 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:20:54 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:20:54 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:20:54 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:20:54 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:20:54 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:20:54 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:20:54 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:20:54 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:20:54 [platforms/__init__.py:247] Automatically detected platform cuda. +DEBUG 04-22 00:20:58 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +DEBUG 04-22 00:21:00 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' +DEBUG 04-22 00:21:00 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +DEBUG 04-22 00:21:00 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +DEBUG 04-22 00:21:00 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +DEBUG 04-22 00:21:00 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(APIServer pid=1) INFO 04-22 00:21:00 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:21:00 [entrypoints/utils.py:299] █ █ █▄ ▄█ +(APIServer pid=1) INFO 04-22 00:21:00 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 +(APIServer pid=1) INFO 04-22 00:21:00 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 +(APIServer pid=1) INFO 04-22 00:21:00 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ +(APIServer pid=1) INFO 04-22 00:21:00 [entrypoints/utils.py:299] +(APIServer pid=1) INFO 04-22 00:21:00 [entrypoints/utils.py:233] non-default args: {'model_tag': 'RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8', 'model': 'RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} +(APIServer pid=1) WARNING 04-22 00:21:00 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND +(APIServer pid=1) DEBUG 04-22 00:21:01 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache +(APIServer pid=1) DEBUG 04-22 00:21:01 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0026569 secs +(APIServer pid=1) INFO 04-22 00:21:01 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM +(APIServer pid=1) INFO 04-22 00:21:01 [config/model.py:1678] Using max model len 8192 +(APIServer pid=1) DEBUG 04-22 00:21:01 [config/model.py:1743] Generative models support chunked prefill. +(APIServer pid=1) DEBUG 04-22 00:21:01 [config/model.py:1801] Generative models support prefix caching. +(APIServer pid=1) DEBUG 04-22 00:21:01 [engine/arg_utils.py:2116] Enabling chunked prefill by default +(APIServer pid=1) DEBUG 04-22 00:21:01 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
+(APIServer pid=1) DEBUG 04-22 00:21:01 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. +(APIServer pid=1) INFO 04-22 00:21:01 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. +(APIServer pid=1) INFO 04-22 00:21:01 [config/vllm.py:790] Asynchronous scheduling is enabled. +(APIServer pid=1) DEBUG 04-22 00:21:01 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. +(APIServer pid=1) DEBUG 04-22 00:21:01 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:21:02 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' +(APIServer pid=1) DEBUG 04-22 00:21:02 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model +DEBUG 04-22 00:21:05 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. +DEBUG 04-22 00:21:05 [platforms/__init__.py:37] Checking if TPU platform is available. +DEBUG 04-22 00:21:05 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' +DEBUG 04-22 00:21:05 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:21:05 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:21:05 [platforms/__init__.py:113] Checking if ROCm platform is available. +DEBUG 04-22 00:21:05 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' +DEBUG 04-22 00:21:05 [platforms/__init__.py:134] Checking if XPU platform is available. +DEBUG 04-22 00:21:05 [platforms/__init__.py:165] Checking if CPU platform is available. +DEBUG 04-22 00:21:05 [platforms/__init__.py:62] Checking if CUDA platform is available. +DEBUG 04-22 00:21:05 [platforms/__init__.py:85] Confirmed CUDA platform is available. +DEBUG 04-22 00:21:05 [platforms/__init__.py:247] Automatically detected platform cuda. 
+DEBUG 04-22 00:21:10 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. +(EngineCore pid=244) DEBUG 04-22 00:21:12 [v1/engine/core.py:1018] Waiting for init message from front-end. +(APIServer pid=1) DEBUG 04-22 00:21:12 [v1/engine/utils.py:1158] HELLO from local core engine process 0. +(EngineCore pid=244) DEBUG 04-22 00:21:12 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/37ef579d-35a8-4fe4-9b49-bd08c5456acc'], outputs=['ipc:///tmp/69aed467-8552-41bb-97fc-aef0d67a2515'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) +(EngineCore pid=244) DEBUG 04-22 00:21:12 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None +(EngineCore pid=244) DEBUG 04-22 00:21:12 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: +(EngineCore pid=244) DEBUG 04-22 00:21:12 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver +(EngineCore pid=244) DEBUG 04-22 00:21:12 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver +(EngineCore pid=244) DEBUG 04-22 00:21:12 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
+(EngineCore pid=244) INFO 04-22 00:21:12 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8', speculative_config=None, tokenizer='RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} +(EngineCore pid=244) DEBUG 04-22 00:21:12 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.26:38023 backend=nccl +(EngineCore pid=244) INFO 04-22 00:21:12 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.26:38023 backend=nccl +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) DEBUG 04-22 00:21:12 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment +[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 +(EngineCore pid=244) INFO 04-22 00:21:12 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A +(EngineCore pid=244) DEBUG 04-22 00:21:13 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776817273.1488597, auto_measure=True +(EngineCore pid=244) DEBUG 04-22 00:21:13 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB +(EngineCore pid=244) DEBUG 04-22 00:21:13 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:21:13 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:21:13 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] +(EngineCore pid=244) DEBUG 04-22 00:21:13 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] +(EngineCore pid=244) DEBUG 04-22 00:21:13 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
+(EngineCore pid=244) DEBUG 04-22 00:21:13 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). +(EngineCore pid=244) DEBUG 04-22 00:21:13 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). +(EngineCore pid=244) INFO 04-22 00:21:13 [v1/worker/gpu_model_runner.py:4735] Starting to load model RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8... +(EngineCore pid=244) DEBUG 04-22 00:21:13 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.self_attn.qkv_proj +(EngineCore pid=244) INFO 04-22 00:21:13 [model_executor/.../linear/__init__.py:291] Selected CutlassInt8ScaledMMLinearKernel for CompressedTensorsW8A8Int8 +(EngineCore pid=244) DEBUG 04-22 00:21:13 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:13 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. +(EngineCore pid=244) INFO 04-22 00:21:13 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
+(EngineCore pid=244) INFO 04-22 00:21:13 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 +(EngineCore pid=244) DEBUG 04-22 00:21:13 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 
00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
model.layers.5.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Int8 for model.layers.8.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
model.layers.13.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] 
Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
model.layers.21.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] 
Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.mlp.down_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.self_attn.qkv_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.self_attn.o_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.mlp.gate_up_proj +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.mlp.down_proj +(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. +(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. +(EngineCore pid=244) DEBUG 04-22 00:21:14 [compilation/backends.py:101] Using InductorStandaloneAdaptor +(EngineCore pid=244) DEBUG 04-22 00:21:14 [config/compilation.py:1194] enabled custom ops: Counter() +(EngineCore pid=244) DEBUG 04-22 00:21:14 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) +(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
+(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00002.safetensors', 'model-00001-of-00002.safetensors']] +(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] Traced files (to be considered for compilation cache): +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py +(EngineCore pid=244) INFO 04-22 00:21:28 [compilation/backends.py:1051] 
Using cache directory: /data/.cache/vllm/torch_compile_cache/e6437cffcc/rank_0_0/backbone for vLLM's torch.compile +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=dcef09dcfb comp=e546579c48 code=7251f1a70adc678d269098238fd40a04d778c50abe51e8dae45473a028705e7a dir=/data/.cache/vllm/torch_compile_cache/e6437cffcc/rank_0_0/backbone +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] Compile env factors (raw): +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'CUDA_HOME': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'NVCC_THREADS': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VERBOSE': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_API_KEY': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 
'auto', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, 
+(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, +(EngineCore pid=244) DEBUG 04-22 00:21:28 
[compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, +(EngineCore pid=244) DEBUG 04-22 00:21:28 
[compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, +(EngineCore pid=244) DEBUG 04-22 00:21:28 
[compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
+(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] Vllm config hash: dcef09dcfb +(EngineCore pid=244) INFO 04-22 00:21:28 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.90 s +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] +(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] +(EngineCore pid=244) DEBUG 04-22 00:21:29 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:21:29 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms +(EngineCore pid=244) DEBUG 04-22 00:21:29 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms +(EngineCore pid=244) DEBUG 04-22 00:21:29 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:21:29 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(EngineCore pid=244) INFO 04-22 00:21:30 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use +(EngineCore pid=244) DEBUG 04-22 00:21:30 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/e6437cffcc/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') +(EngineCore pid=244) DEBUG 04-22 00:21:31 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:21:31 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms +(EngineCore pid=244) DEBUG 04-22 00:21:31 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.5 ms +(EngineCore pid=244) DEBUG 
04-22 00:21:31 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:21:31 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms +(APIServer pid=1) DEBUG 04-22 00:21:32 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. +(EngineCore pid=244) DEBUG 04-22 00:21:33 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/e6437cffcc/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') +(EngineCore pid=244) DEBUG 04-22 00:21:34 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices +(EngineCore pid=244) DEBUG 04-22 00:21:34 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms +(EngineCore pid=244) DEBUG 04-22 00:21:34 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms +(EngineCore pid=244) DEBUG 04-22 00:21:34 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes +(EngineCore pid=244) DEBUG 04-22 00:21:34 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms +(EngineCore pid=244) DEBUG 04-22 00:21:34 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/e6437cffcc/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') +(EngineCore pid=244) INFO 04-22 00:21:34 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.31 s +(EngineCore pid=244) DEBUG 04-22 00:21:34 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/e6437cffcc/rank_0_0/backbone/computation_graph.py +(EngineCore 
pid=244) INFO 04-22 00:21:35 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/2034ef402a31dd9ff23fe79a0a9c284842df5a9a5b69f98bb3d96953f0298379/rank_0_0/model +(EngineCore pid=244) INFO 04-22 00:21:35 [compilation/monitor.py:48] torch.compile took 12.49 s in total +(EngineCore pid=244) INFO 04-22 00:21:36 [compilation/monitor.py:76] Initial profiling/warmup run took 0.31 s +(EngineCore pid=244) INFO 04-22 00:21:41 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 +(EngineCore pid=244) DEBUG 04-22 00:21:41 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling +(EngineCore pid=244) INFO 04-22 00:21:41 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) +(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:21:42 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:21:42 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) +(APIServer pid=1) DEBUG 04-22 00:21:42 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
+(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 128.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:21:42 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) DEBUG 04-22 00:21:42 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) +(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 228.00 MiB first-capture + (51-1) × 6.00 MiB per-graph +(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs +(EngineCore pid=244) INFO 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.81 GiB total +(EngineCore pid=244) DEBUG 04-22 00:21:43 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB +(EngineCore pid=244) DEBUG 04-22 00:21:43 [v1/worker/gpu_worker.py:430] Free memory after profiling: 69.79 GiB (total), 66.34 GiB (within requested) +(EngineCore pid=244) DEBUG 04-22 00:21:43 [v1/worker/gpu_worker.py:435] Memory profiling takes 19.77 seconds. 
Total non KV cache memory: 10.59GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.25GiB; weights memory: 8.14GiB. +(EngineCore pid=244) INFO 04-22 00:21:43 [v1/worker/gpu_worker.py:436] Available KV cache memory: 64.64 GiB +(EngineCore pid=244) INFO 04-22 00:21:43 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9602 to maintain the same effective KV cache size. +(EngineCore pid=244) INFO 04-22 00:21:43 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,210,272 tokens +(EngineCore pid=244) INFO 04-22 00:21:43 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 147.74x +(EngineCore pid=244) 2026-04-22 00:21:43,131 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... +(EngineCore pid=244) DEBUG 04-22 00:21:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None +(EngineCore pid=244) 2026-04-22 00:21:43,141 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends +(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 Date: Wed, 22 Apr 2026 22:37:54 -0400 Subject: [PATCH 11/24] Draft blog Signed-off-by: Jing Chen --- accuracy/blog-gpu-capacity.md | 90 +++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 accuracy/blog-gpu-capacity.md diff --git a/accuracy/blog-gpu-capacity.md b/accuracy/blog-gpu-capacity.md new file mode 100644 index 00000000..96207324 --- /dev/null +++ b/accuracy/blog-gpu-capacity.md @@ -0,0 +1,90 @@ +# 57 Experiments Later: What We Learned About LLM Memory Prediction + +*GPU memory planning for LLM deployments is still mostly guesswork. 
Here's what we learned from measuring it empirically across 34 architectures.* + +--- + +You're standing up a benchmark suite and need to know how many GPUs each model configuration requires before sizing the cluster. Or you're launching a serving application and want to plan capacity without over-provisioning by 3x. Or you're a researcher asking whether two H100s will be enough for a 70B model, or whether you need four. + +In all of these cases, the question is the same: **how much GPU memory will this actually need?** + +Most teams answer it by copying what someone else deployed, or by spinning up the pod, watching it OOM, and doubling the resources. This works, but it gets harder as models grow larger and serving configurations more complex. Tensor parallelism, pipeline parallelism, quantization, and long-context windows all change the memory footprint in non-obvious ways. + +[llm-d-planner](https://github.com/llm-d-incubation/llm-d-planner) is an open-source capacity planning library built to answer this question before you deploy. To make sure it wasn't just replacing guesswork with a false sense of precision, we ran 57 experiments on H100 GPUs to validate its predictions against reality. Here's what we found, and why we're asking the community to help make it even better. + +--- + +## What llm-d-planner Does + +llm-d-planner is a pip-installable Python library for LLM capacity planning. You give it a model and a serving configuration and it predicts GPU memory consumption and max concurrency across four components: weights, KV cache, activation memory, and non-torch overhead (CUDA runtime and NCCL buffers for multi-GPU). Each scales differently with tensor parallelism, context length, and quantization, so knowing which component is driving your footprint tells you what to actually change. 
For each component, the planner anchors to a source of truth wherever one exists: `config.json` and safetensor file headers for weights, vLLM's allocation strategy for KV cache, and empirically measured constants for things that can't be derived analytically, like activation memory. The experiment in this post is how those constants are kept honest. + +--- + +## The Experiment: Trusting but Verifying + +Claiming a tool is accurate is easy. Measuring it is harder. We launched vLLM servers across 57 configurations on H100-80GB GPUs, captured the full startup logs for each, and parsed the actual memory measurements reported by vLLM at initialization. We then compared those measurements against llm-d-planner's predictions for every configuration. The sweep covered: + +- **34 model architectures**: Llama, Qwen, Gemma, Granite, Mistral, DeepSeek, Phi, Mixtral, and multimodal models including LLaVA and Kimi-VL +- **Tensor parallelism** (TP 1, 2, 4) and **pipeline parallelism** (PP 1, 2, 4) +- **Context lengths** from 2,048 to 32,768 tokens +- **Dtype and quantization variants**: bfloat16, float16; compressed-tensors, GPTQ +- **vLLM version sensitivity**: Qwen3-14B across v0.15.0 through v0.19.0 to track how memory behavior changes across releases + +For each run, we compared predicted values against the four measured memory components independently. The raw results and all run JSON files are committed to the repository, and the analysis is fully reproducible locally without cluster access. + +--- + +## What We Found + +### The headline: accurate where it counts most + +**Weight memory: 0.89% mean absolute error** across 53 of the 57 runs. (The remaining 4 used parameters the planner doesn't yet model, float32 dtype and runtime fp8 quantization, and are discussed below.) This is the single largest memory component; for a model like Llama-3.1-8B at TP=1, weights consume about 15 GiB of the 79 GiB available. It's also the hardest to get right across a diverse model set. 
+ +Weight prediction is harder than it looks: dense, MoE, multi-head latent attention, and vision-language models all organize parameters differently, quantization changes the bytes-per-parameter, and TP sharding depends on how dimensions divide across ranks. The formula handles all of this by reading `config.json` for architecture parameters and safetensor headers for exact tensor shapes, giving precise counts without downloading the full model and making it generalizable to any model on HuggingFace beyond the 34 we explicitly tested. Across dense, MoE, multimodal, and quantized architectures, it held to under 1% error. + +**KV cache memory: 0.34% mean error** across all runs. This is the component that matters most for capacity planning, as it determines your maximum concurrent token budget. For Llama-3.1-8B at TP=1 with 8K context, that's roughly 58 GiB of KV pool, and we're within half a GiB across every context length we tested. + +One insight worth pausing on: the KV pool size is *independent* of `max_model_len`. vLLM sizes the pool from whatever memory remains after weights and activation are allocated, then figures out how many tokens fit given the per-token KV size for that architecture. This means setting a longer context window doesn't shrink your KV pool; it just means each request can occupy more of it. Tools that pre-allocate based on `max_model_len` will over-estimate memory for long-context configs and leave capacity on the table. + +These two components together typically account for 90%+ of total GPU memory consumption. Getting them right is what makes the planner useful in practice. + +### The honest part: smaller components, real errors + +**Activation memory** showed a mean error of +195%. That sounds alarming, so let's ground it in absolute numbers. For Llama-3.1-8B at TP=1, our formula predicted 4.80 GiB; vLLM v0.19.0 actually used 1.89 GiB, an over-estimate of about 2.9 GiB. 
On a GPU with 79 GiB of VRAM where weights alone consume 15 GiB and the KV pool takes 58 GiB, a 2.9 GiB error in a smaller component is meaningful but bounded. + +The root cause is more interesting than the magnitude: **vLLM v0.17.0 quietly reduced activation memory by ~60%, and we didn't notice.** + +Our version sensitivity study tells the story clearly: + +| vLLM version | Activation (Qwen3-14B) | +|:---:|:---:| +| v0.15.0 | 5.64 GiB | +| v0.16.0 | 5.64 GiB | +| **v0.17.0** | **2.23 GiB** | +| v0.18.0 | 2.23 GiB | +| v0.19.0 | ~2.21 GiB | + +The planner's Qwen3 activation constant was 5.60 GiB, a near-exact match for v0.16.0. Our constants had been calibrated against an older vLLM release and were never updated as vLLM evolved. The 60% reduction at v0.17.0 freed memory that vLLM reallocated to the KV cache, actually *improving* serving capacity, but our planner didn't know about it. + +This kind of silent drift is precisely why empirical validation matters. We didn't catch it until we ran the experiments, and the fix was straightforward once we knew where to look: re-calibrate every architecture constant against v0.19.0 measurements. That's now done, and the updated constants are in the library. + +**Non-torch overhead** (CUDA runtime + NCCL buffers) was under-estimated by 44% on average. At TP=1, this is a small absolute amount (~0.25 GiB actual vs 0.15 GiB predicted). At TP>=2, NCCL all-reduce buffers push actual overhead to ~2.1 GiB per GPU versus our constant of 0.60 GiB, a more meaningful gap. Updated multi-GPU constants are also in. + +There are a few configurations the experiment didn't cover that the planner doesn't yet model: fp8 KV cache dtype (halves per-token storage, roughly doubling token capacity), float32 dtype overrides (doubles weight memory), runtime fp8 quantization, and data parallelism. 
These are real gaps for anyone running quantized production models today, and they're actively being worked on — contributions are welcome if you need one of these sooner. The sweep also turned up a subtle correctness bug in `find_possible_tp`: it wasn't verifying that TP values divide `vocab_size`, which can cause vLLM to reject a configuration the planner suggests as valid. That's fixed. + +--- + +## Join the Community + +We covered 34 architectures. New architectures ship every week, and vLLM will keep evolving. Accuracy at a point in time isn't enough; what matters is having a community that keeps the constants current as things change. + +**If your model isn't covered, or a new architecture ships with memory optimizations** (a new attention variant, a custom KV cache layout, or a novel quantization scheme), llm-d-planner should be where those updated constants land first. The sweep runner in `accuracy/` is fully documented and self-contained; run it against your own cluster, submit the results as a PR, and everyone who installs the library gets the improvement. + +**Get started:** + +- [GitHub: llm-d-incubation/llm-d-planner](https://github.com/llm-d-incubation/llm-d-planner) +- [Accuracy campaign results and methodology](https://github.com/llm-d-incubation/llm-d-planner/tree/main/accuracy) +- [Run the sweep on your own cluster](https://github.com/llm-d-incubation/llm-d-planner/blob/main/accuracy/README.md) +- Open an issue or PR; contributions welcome + +No one should have to guess how many GPUs they need. 
From a95ae283ce9845996f821747135f908e3f22d534 Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Thu, 23 Apr 2026 15:02:41 -0400 Subject: [PATCH 12/24] Shorten summary Signed-off-by: Jing Chen --- accuracy/blog-gpu-capacity.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/accuracy/blog-gpu-capacity.md b/accuracy/blog-gpu-capacity.md index 96207324..984b070c 100644 --- a/accuracy/blog-gpu-capacity.md +++ b/accuracy/blog-gpu-capacity.md @@ -10,13 +10,15 @@ In all of these cases, the question is the same: **how much GPU memory will this Most teams answer it by copying what someone else deployed, or by spinning up the pod, watching it OOM, and doubling the resources. This works, but it gets harder as models grow larger and serving configurations more complex. Tensor parallelism, pipeline parallelism, quantization, and long-context windows all change the memory footprint in non-obvious ways. -[llm-d-planner](https://github.com/llm-d-incubation/llm-d-planner) is an open-source capacity planning library built to answer this question before you deploy. To make sure it wasn't just replacing guesswork with a false sense of precision, we ran 57 experiments on H100 GPUs to validate its predictions against reality. Here's what we found, and why we're asking the community to help make it even better. +[llm-d-planner](https://github.com/llm-d-incubation/llm-d-planner) is an open-source library that guides LLM deployments from concept to production. One of its core submodules is a capacity planner built to answer this question before you deploy. To make sure it wasn't just replacing guesswork with a false sense of precision, we ran 57 experiments on H100 GPUs to validate its predictions against reality. Here's what we found, and why we're asking the community to help make it even better. --- ## What llm-d-planner Does -llm-d-planner is a pip-installable Python library for LLM capacity planning. 
You give it a model and a serving configuration and it predicts GPU memory consumption and max concurrency across four components: weights, KV cache, activation memory, and non-torch overhead (CUDA runtime and NCCL buffers for multi-GPU). Each scales differently with tensor parallelism, context length, and quantization, so knowing which component is driving your footprint tells you what to actually change. For each component, the planner anchors to a source of truth wherever one exists: `config.json` and safetensor file headers for weights, vLLM's allocation strategy for KV cache, and empirically measured constants for things that can't be derived analytically, like activation memory. The experiment in this post is how those constants are kept honest. +llm-d-planner guides LLM deployments from concept to production: conversational requirements gathering, SLO-driven model and GPU recommendations, what-if analysis, one-click Kubernetes config generation, and monitoring. The capacity planner is a pip-installable subcomponent that focuses on one question: how much GPU memory will this deployment actually need. + +It breaks memory into four components: weights, KV cache, activation memory, and non-torch overhead (CUDA runtime and NCCL buffers for multi-GPU). Each scales differently with tensor parallelism, context length, and quantization, so knowing which component is driving your footprint tells you what to actually change. For each component, the planner anchors to a source of truth wherever one exists: `config.json` and safetensor file headers for weights, vLLM's allocation strategy for KV cache, and empirically measured constants for things that can't be derived analytically, like activation memory. The experiment in this post is how those constants are kept honest. 
--- From 56dc5b76ca535960998bdf9fd4a3077d98fdf60a Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Thu, 23 Apr 2026 20:28:33 -0400 Subject: [PATCH 13/24] rm temp files Signed-off-by: Jing Chen --- accuracy/k8s/configmap-sweep-versions.yaml | 15 + accuracy/k8s/orchestrator-job-gemma.yaml | 46 --- ...ma.yaml => orchestrator-job-versions.yaml} | 14 +- accuracy/scripts/sweep-codellama.yaml | 21 -- accuracy/scripts/sweep-gemma.yaml | 35 -- accuracy/scripts/sweep-versions.yaml | 46 +++ accuracy/scripts/sweep.yaml | 339 ++++++++---------- 7 files changed, 209 insertions(+), 307 deletions(-) create mode 100644 accuracy/k8s/configmap-sweep-versions.yaml delete mode 100644 accuracy/k8s/orchestrator-job-gemma.yaml rename accuracy/k8s/{orchestrator-job-codellama.yaml => orchestrator-job-versions.yaml} (71%) delete mode 100644 accuracy/scripts/sweep-codellama.yaml delete mode 100644 accuracy/scripts/sweep-gemma.yaml create mode 100644 accuracy/scripts/sweep-versions.yaml diff --git a/accuracy/k8s/configmap-sweep-versions.yaml b/accuracy/k8s/configmap-sweep-versions.yaml new file mode 100644 index 00000000..8cb918c7 --- /dev/null +++ b/accuracy/k8s/configmap-sweep-versions.yaml @@ -0,0 +1,15 @@ +# accuracy/k8s/configmap-sweep-versions.yaml +# Generated from accuracy/scripts/sweep-versions.yaml. +# Sync with: kubectl create configmap vllm-mem-sweep-versions \ +# --from-file=sweep-versions.yaml=accuracy/scripts/sweep-versions.yaml \ +# --namespace llmdplanner --dry-run=client -o yaml \ +# | kubectl apply -f - +apiVersion: v1 +kind: ConfigMap +metadata: + name: vllm-mem-sweep-versions + namespace: llmdplanner +data: + sweep-versions.yaml: | + # Contents of accuracy/scripts/sweep-versions.yaml go here. + # Use the kubectl command above to generate and keep in sync. 
diff --git a/accuracy/k8s/orchestrator-job-gemma.yaml b/accuracy/k8s/orchestrator-job-gemma.yaml deleted file mode 100644 index acc7f303..00000000 --- a/accuracy/k8s/orchestrator-job-gemma.yaml +++ /dev/null @@ -1,46 +0,0 @@ -# Submit with: kubectl apply -f accuracy/k8s/orchestrator-job-gemma.yaml -# Monitor with: kubectl logs -f job/vllm-mem-orchestrator-gemma -n llmdplanner -apiVersion: batch/v1 -kind: Job -metadata: - name: vllm-mem-orchestrator-gemma - namespace: llmdplanner -spec: - backoffLimit: 0 - activeDeadlineSeconds: 28800 # 8-hour cap for 6-model run - template: - spec: - serviceAccountName: vllm-mem-orchestrator - restartPolicy: Never - volumes: - - name: data - persistentVolumeClaim: - claimName: vllm-mem-data - - name: scripts - configMap: - name: vllm-mem-scripts - defaultMode: 0755 - - name: sweep - configMap: - name: vllm-mem-sweep-gemma - containers: - - name: orchestrator - image: python:3.11-slim - command: ["/bin/bash", "-c"] - args: - - | - pip install pyyaml kubernetes --quiet --no-cache-dir && - python /scripts/sweep_runner.py \ - --config /sweep/sweep-gemma.yaml \ - --results /data/results/ - volumeMounts: - - name: data - mountPath: /data - - name: scripts - mountPath: /scripts - - name: sweep - mountPath: /sweep - resources: - requests: - cpu: "500m" - memory: "512Mi" diff --git a/accuracy/k8s/orchestrator-job-codellama.yaml b/accuracy/k8s/orchestrator-job-versions.yaml similarity index 71% rename from accuracy/k8s/orchestrator-job-codellama.yaml rename to accuracy/k8s/orchestrator-job-versions.yaml index c6b8917b..0cbefe9c 100644 --- a/accuracy/k8s/orchestrator-job-codellama.yaml +++ b/accuracy/k8s/orchestrator-job-versions.yaml @@ -1,13 +1,13 @@ -# Submit with: kubectl apply -f accuracy/k8s/orchestrator-job-codellama.yaml -# Monitor with: kubectl logs -f job/vllm-mem-orchestrator-codellama -n llmdplanner +# Submit with: kubectl apply -f accuracy/k8s/orchestrator-job-versions.yaml +# Monitor with: kubectl logs -f 
job/vllm-mem-orchestrator-versions -n llmdplanner apiVersion: batch/v1 kind: Job metadata: - name: vllm-mem-orchestrator-codellama + name: vllm-mem-orchestrator-versions namespace: llmdplanner spec: backoffLimit: 0 - activeDeadlineSeconds: 7200 # 2-hour cap for 2-model run + activeDeadlineSeconds: 14400 # 4-hour cap for 4-run sweep template: spec: serviceAccountName: vllm-mem-orchestrator @@ -22,7 +22,7 @@ spec: defaultMode: 0755 - name: sweep configMap: - name: vllm-mem-sweep-codellama + name: vllm-mem-sweep-versions containers: - name: orchestrator image: python:3.11-slim @@ -31,8 +31,8 @@ spec: - | pip install pyyaml kubernetes --quiet --no-cache-dir && python /scripts/sweep_runner.py \ - --config /sweep/sweep-codellama.yaml \ - --results /data/results/ + --config /sweep/sweep-versions.yaml \ + --results /data/results/version-sweep volumeMounts: - name: data mountPath: /data diff --git a/accuracy/scripts/sweep-codellama.yaml b/accuracy/scripts/sweep-codellama.yaml deleted file mode 100644 index efb4eb77..00000000 --- a/accuracy/scripts/sweep-codellama.yaml +++ /dev/null @@ -1,21 +0,0 @@ -defaults: - gpu: H100-80GB - gpu_memory_utilization: "0.95" - max_model_len: 8192 - pp: 1 - dp: 1 - dtype: auto - kv_cache_dtype: auto - quantization: null - vllm_image: vllm/vllm-openai:v0.19.0 - namespace: llmdplanner - results_pvc: vllm-mem-data - node_selector: - nvidia.com/gpu.product: NVIDIA-H100-80GB-HBM3 - -runs: - - model: codellama/CodeLlama-7b-hf # 7B dense; LlamaForCausalLM architecture - tp: 1 - - - model: codellama/CodeLlama-34b-hf # 34B dense; ~65 GiB bf16 - tp: 2 diff --git a/accuracy/scripts/sweep-gemma.yaml b/accuracy/scripts/sweep-gemma.yaml deleted file mode 100644 index 2d2df8d9..00000000 --- a/accuracy/scripts/sweep-gemma.yaml +++ /dev/null @@ -1,35 +0,0 @@ -defaults: - gpu: H100-80GB - gpu_memory_utilization: "0.95" - max_model_len: 8192 - pp: 1 - dp: 1 - dtype: auto - kv_cache_dtype: auto - quantization: null - vllm_image: vllm/vllm-openai:v0.19.0 - 
namespace: llmdplanner - results_pvc: vllm-mem-data - hf_token_secret: hf-token-gemma - node_selector: - nvidia.com/gpu.product: NVIDIA-H100-80GB-HBM3 - kubernetes.io/hostname: pokprod-b93r38s0 # only node with confirmed internet egress - -runs: - - model: google/gemma-2-2b-it # 2.6B dense; Gemma2 architecture - tp: 1 - - - model: google/gemma-2-9b-it # 9.2B dense; Gemma2 architecture - tp: 1 - - - model: google/gemma-2-27b-it # 27.2B dense; ~54 GiB bf16 - tp: 1 - - - model: google/gemma-3-4b-it # 4B dense; Gemma3 architecture - tp: 1 - - - model: google/gemma-3-12b-it # 12B dense; Gemma3 architecture - tp: 1 - - - model: google/gemma-3-27b-it # 27B dense; Gemma3 architecture - tp: 1 diff --git a/accuracy/scripts/sweep-versions.yaml b/accuracy/scripts/sweep-versions.yaml new file mode 100644 index 00000000..9d30e9dc --- /dev/null +++ b/accuracy/scripts/sweep-versions.yaml @@ -0,0 +1,46 @@ +# vLLM version sensitivity sweep — activation memory across releases. +# Goal: measure how activation memory reported by vLLM changes across versions +# to validate (or recalibrate) the planner's per-architecture activation constants. +# Model: Qwen/Qwen3-14B — single H100 (tp=1, ~28 GiB bf16 weights). +# Note: tp=5 is invalid for this model (vocab_size=151936 not divisible by 5). +# +# Results land in results/vX.Y.Z/; update the corresponding results_pvc or +# output path in your job template to keep versions separated. 
+# +# Run with: python sweep_runner.py --config sweep-versions.yaml + +defaults: + gpu: H100-80GB + gpu_memory_utilization: "0.95" + max_model_len: 8192 + tp: 1 + pp: 1 + dp: 1 + dtype: auto + kv_cache_dtype: auto + quantization: null + namespace: llmdplanner + results_pvc: vllm-mem-data + node_selector: + nvidia.com/gpu.product: NVIDIA-H100-80GB-HBM3 + +runs: + - model: Qwen/Qwen3-14B + vllm_image: vllm/vllm-openai:v0.15.0 + _label: qwen3-14b-vllm-v0.15.0 + _sweep_dim: vllm_version + + - model: Qwen/Qwen3-14B + vllm_image: vllm/vllm-openai:v0.16.0 + _label: qwen3-14b-vllm-v0.16.0 + _sweep_dim: vllm_version + + - model: Qwen/Qwen3-14B + vllm_image: vllm/vllm-openai:v0.17.0 + _label: qwen3-14b-vllm-v0.17.0 + _sweep_dim: vllm_version + + - model: Qwen/Qwen3-14B + vllm_image: vllm/vllm-openai:v0.18.0 + _label: qwen3-14b-vllm-v0.18.0 + _sweep_dim: vllm_version diff --git a/accuracy/scripts/sweep.yaml b/accuracy/scripts/sweep.yaml index 3424bcfa..64570531 100644 --- a/accuracy/scripts/sweep.yaml +++ b/accuracy/scripts/sweep.yaml @@ -1,6 +1,10 @@ -# Canonical run matrix for the vLLM memory validation campaign. +# Canonical run matrix — vLLM v0.19.0 / H100-80GB +# 57 successful runs across 34 models. See accuracy/accuracy_report.md for results. +# Version sensitivity (Part 2 — Qwen3-14B across v0.15.0–v0.18.0): see sweep-versions.yaml. +# # Edit defaults.node_selector to match your cluster's GPU node label. # Run `kubectl get nodes --show-labels` to find the right label. + defaults: gpu: H100-80GB gpu_memory_utilization: "0.95" @@ -22,270 +26,209 @@ defaults: runs: # ── Core model coverage ─────────────────────────────────────────────────── - # One run per model at the minimum feasible TP. TP sensitivity is captured - # separately in the "Argument sensitivity: tensor parallelism" section below. 
- - # DONE: moonshotai/Kimi-VL-A3B-Instruct tp=1 - # - model: moonshotai/Kimi-VL-A3B-Instruct # 16B total, 3B active MoE, vision-language - # tp: 1 - # trust_remote_code: true - - # DONE: moonshotai/Kimi-Dev-72B tp=2 - # - model: moonshotai/Kimi-Dev-72B # 72B dense, Qwen2 architecture - # tp: 2 # tp=1 OOM: ~144 GiB weights exceed single H100 80GB - # trust_remote_code: true + # One run per model at the minimum feasible TP (baseline: tp=pp=dp=1, len=8192, bf16). + # Llama-3.1-8B-Instruct and Qwen2.5-7B-Instruct baselines are in the TP section below. - model: codellama/CodeLlama-7b-hf # 7B dense; LlamaForCausalLM architecture tp: 1 + # codellama/CodeLlama-34b-hf tp=2 failed: GPU contention at runtime - - model: codellama/CodeLlama-34b-hf # 34B dense; ~65 GiB bf16; tp=1 OOM risk, using tp=2 - tp: 2 - - - model: deepseek-ai/DeepSeek-V2-Lite-Chat # 16B total, 2.4B active MoE; DeepSeekV2 arch + - model: deepseek-ai/DeepSeek-V2-Lite-Chat # 16B total, 2.4B active MoE; DeepSeekV2 architecture tp: 1 - - model: ibm-granite/granite-3.1-2b-instruct + - model: google/gemma-7b # 7B dense; Gemma architecture (gated) tp: 1 - _label: granite-3-1-2b + hf_token_secret: hf-token-gemma - - model: ibm-granite/granite-3.1-8b-instruct + - model: google/gemma-2-2b-it # 2.6B dense; Gemma2 architecture tp: 1 - _label: granite-3-1-8b + hf_token_secret: hf-token-gemma - - model: ibm-granite/granite-3.3-8b-instruct + - model: google/gemma-2-9b-it # 9.2B dense; Gemma2 architecture tp: 1 + hf_token_secret: hf-token-gemma - - model: ibm-granite/granite-vision-3.3-2b # vision-language; GraniteSpeechEncoderModel arch + - model: google/gemma-2-27b-it # 27.2B dense; ~54 GiB bf16; fits H100 80GB at tp=1 tp: 1 + hf_token_secret: hf-token-gemma - - model: microsoft/phi-4 # 14B dense; Phi3 architecture + - model: google/gemma-3-4b-it # 4B dense; Gemma3 architecture tp: 1 + hf_token_secret: hf-token-gemma - - model: mistralai/Mistral-Small-3.1-24B-Instruct-2503 # 24B dense; Mistral3 architecture + - model: 
google/gemma-3-12b-it # 12B dense; Gemma3 architecture tp: 1 + hf_token_secret: hf-token-gemma - - model: mistralai/Mixtral-8x7B-Instruct-v0.1 # 56B total, 14B active MoE - tp: 2 # tp=1 OOM: ~87 GiB weights exceed single H100 80GB + - model: google/gemma-3-27b-it # 27B dense; Gemma3 architecture + tp: 1 + hf_token_secret: hf-token-gemma + # google/gemma-4-E4B-it unsupported: Gemma4 arch not in transformers bundled with vLLM v0.19.0 - - model: openai/gpt-oss-20b # 20B dense; sampler warmup needs ~786 MiB/GPU beyond KV; gmu=0.90 leaves headroom - tp: 2 - gpu_memory_utilization: "0.90" - # gpt-oss-120b skipped — OOM or infra unavailability at all tested tp values + - model: ibm-granite/granite-3.1-2b-instruct + tp: 1 + _label: granite-3-1-2b - - model: microsoft/phi-2 # 2.7B dense; Phi architecture; max ctx = 2048 + - model: ibm-granite/granite-3.1-8b-instruct tp: 1 - max_model_len: 2048 - - model: google/gemma-7b # 7B dense; Gemma architecture (gated) + - model: ibm-granite/granite-3.3-8b-instruct tp: 1 - hf_token_secret: hf-token-gemma - - model: Qwen/Qwen2.5-7B-Instruct # 7B dense; reference model for sensitivity sweeps + - model: ibm-granite/granite-vision-3.3-2b # vision-language; LlavaNext architecture tp: 1 - - model: Qwen/Qwen2.5-72B-Instruct # 72B dense - tp: 2 # tp=1 OOM: ~144 GiB weights exceed single H100 80GB + - model: meta-llama/Llama-4-Scout-17B-16E-Instruct # 109B total, 17B active MoE (16 experts); tp=1 OOM (~212 GiB), tp=2 OOM (~106 GiB) + tp: 4 - - model: Qwen/Qwen3-8B # 8B dense; Qwen3 architecture + - model: microsoft/phi-2 # 2.7B dense; Phi architecture; max_position_embeddings=2048 tp: 1 + max_model_len: 2048 - - model: Qwen/Qwen3-30B-A3B # 30B total, 3B active MoE; Qwen3Moe architecture + - model: microsoft/phi-4 # 14B dense; Phi3 architecture tp: 1 - - model: meta-llama/Llama-4-Scout-17B-16E-Instruct # 109B total, 17B active MoE (16 experts) - tp: 4 # tp=1 OOM (~212 GiB total), tp=2 OOM (~106 GiB total) - - # DONE: Qwen/Qwen1.5-MoE-A2.7B tp=1 
- # - model: Qwen/Qwen1.5-MoE-A2.7B # 14.3B total, 2.7B active MoE; Qwen2Moe architecture - # tp: 1 + - model: mistralai/Mistral-Small-3.1-24B-Instruct-2503 # 24B dense; Mistral3 architecture + tp: 1 - # UNSUPPORTED: gemma4 arch not in transformers bundled with vLLM v0.19.0 - # - model: google/gemma-4-E4B-it # MoE; Gemma4 architecture (gated) - # tp: 1 - # hf_token_secret: hf-token-gemma + - model: mistralai/Mixtral-8x7B-Instruct-v0.1 # 56B total, 14B active MoE; tp=1 OOM (~87 GiB weights) + tp: 2 - # ── Gemma models ────────────────────────────────────────────────────────── - # Requires hf-token-gemma secret (separate from hf-token; Gemma repos are gated). - - model: google/gemma-2-2b-it # 2.6B dense; Gemma2 architecture + - model: moonshotai/Kimi-VL-A3B-Instruct # 16B total, 3B active MoE; vision-language; KimiVL architecture tp: 1 - hf_token_secret: hf-token-gemma + trust_remote_code: true - - model: google/gemma-2-9b-it # 9.2B dense; Gemma2 architecture - tp: 1 - hf_token_secret: hf-token-gemma + - model: moonshotai/Kimi-Dev-72B # 72B dense; Qwen2 architecture; tp=1 OOM (~144 GiB weights) + tp: 2 + trust_remote_code: true - - model: google/gemma-2-27b-it # 27.2B dense; fits H100 80GB at bf16 (~54 GiB weights) - tp: 1 - hf_token_secret: hf-token-gemma + - model: openai/gpt-oss-20b # 20B dense; GptOss architecture; mxfp4 quant from model config; sampler warmup needs ~786 MiB/GPU beyond KV + tp: 2 + gpu_memory_utilization: "0.90" # gmu=0.95 fails (OOM during sampler warmup); 0.90 frees ~4 GiB/GPU headroom + # openai/gpt-oss-120b skipped: OOM at all tested tp values - - model: google/gemma-3-4b-it # 4B dense; Gemma3 architecture + - model: Qwen/Qwen1.5-MoE-A2.7B # 14.3B total, 2.7B active MoE; Qwen2Moe architecture tp: 1 - hf_token_secret: hf-token-gemma - - model: google/gemma-3-12b-it # 12B dense; Gemma3 architecture - tp: 1 - hf_token_secret: hf-token-gemma + - model: Qwen/Qwen2.5-72B-Instruct # 72B dense; tp=1 OOM (~144 GiB weights) + tp: 2 - - model: 
google/gemma-3-27b-it # 27B dense; Gemma3 architecture + - model: Qwen/Qwen3-8B # 8B dense; Qwen3 architecture tp: 1 - hf_token_secret: hf-token-gemma - # ── Kimi Dev 72B TP sensitivity (retry tp=4; tp=2 succeeded) ───────────── - - model: moonshotai/Kimi-Dev-72B - tp: 4 - trust_remote_code: true - _sweep_dim: tp + - model: Qwen/Qwen3-30B-A3B # 30B total, 3B active MoE; Qwen3Moe architecture + tp: 1 - # ── Argument sensitivity: tensor parallelism ───────────────────────────── - # Llama-3.1-8B has 32 attention heads; tp=3 is invalid (32 % 3 ≠ 0). + # ── Sensitivity: tensor parallelism ───────────────────────────────────── + # Llama-3.1-8B: 32 attention heads; tp=3 invalid (32 % 3 ≠ 0). - model: meta-llama/Llama-3.1-8B-Instruct tp: [1, 2, 4] _sweep_dim: tp - - model: Qwen/Qwen2.5-7B-Instruct # 28 attention heads; valid tp: 1, 2, 4, 7, 14, 28 + # Qwen2.5-7B: 28 attention heads; valid tp: 1, 2, 4, 7, 14, 28. + - model: Qwen/Qwen2.5-7B-Instruct tp: [1, 2, 4] _sweep_dim: tp - # ── Argument sensitivity: data parallelism ──────────────────────────────── - - model: meta-llama/Llama-3.1-8B-Instruct - tp: 1 - dp: [1, 2] - _sweep_dim: dp + - model: moonshotai/Kimi-VL-A3B-Instruct # baseline at tp=1 in core section above + tp: 2 + trust_remote_code: true + _sweep_dim: tp + + - model: moonshotai/Kimi-Dev-72B # baseline at tp=2 in core section above + tp: 4 + trust_remote_code: true + _sweep_dim: tp - # ── Argument sensitivity: pipeline parallelism ─────────────────────────── + # ── Sensitivity: pipeline parallelism ─────────────────────────────────── + # pp=1 baseline is covered by Llama-3.1-8B-Instruct tp=1 entry above. - model: meta-llama/Llama-3.1-8B-Instruct tp: 1 pp: [2, 4] _sweep_dim: pp - # ── Argument sensitivity: --max-model-len ──────────────────────────────── + # ── Sensitivity: context length (max_model_len) ───────────────────────── + # len=8192 baseline is covered by the tp sweep entries above. 
+ # KV pool (GiB) is independent of max_model_len; only token count changes. - model: meta-llama/Llama-3.1-8B-Instruct tp: 1 - max_model_len: [2048, 4096, 8192, 16384, 32768] + max_model_len: [2048, 4096, 16384, 32768] _sweep_dim: max_model_len - model: Qwen/Qwen2.5-7B-Instruct tp: 1 - max_model_len: [2048, 4096, 8192, 16384, 32768] + max_model_len: [2048, 4096, 16384, 32768] _sweep_dim: max_model_len - # ── Argument sensitivity: --dtype ───────────────────────────────────────── - # DONE: all three dtype values completed - # - model: meta-llama/Llama-3.1-8B-Instruct - # tp: 1 - # dtype: [float16, bfloat16, float32] - # kv_cache_dtype: auto - # _sweep_dim: dtype - - # ── Argument sensitivity: --kv-cache-dtype ──────────────────────────────── - # DONE: both auto and fp8 completed - # - model: meta-llama/Llama-3.1-8B-Instruct - # tp: 1 - # dtype: bfloat16 - # kv_cache_dtype: [auto, fp8] - # _sweep_dim: kv_cache_dtype - - # DONE: both auto and fp8 completed - # - model: Qwen/Qwen2.5-7B-Instruct - # tp: 1 - # dtype: bfloat16 - # kv_cache_dtype: [auto, fp8] - # _sweep_dim: kv_cache_dtype - - # ── Argument sensitivity: non-power-of-2 tp ────────────────────────────── - # INVALID: Qwen3-14B vocab_size=151936 is not divisible by 5 (vLLM enforces - # vocab sharding). num_attention_heads=40 supports tp=5, but vocab does not. - # Valid tp values: divisors of gcd(40, 151936)=8 → {1,2,4,8}. - # - model: Qwen/Qwen3-14B - # tp: 5 - # _sweep_dim: tp_odd - - # ── Argument sensitivity: --quantization ────────────────────────────────── - # DONE: FP16 baseline (same run_id as dtype=float16 above) - # - model: meta-llama/Llama-3.1-8B-Instruct # FP16 baseline - # tp: 1 - # dtype: float16 - # kv_cache_dtype: auto - # quantization: null - # _sweep_dim: quantization - - # runtime fp8 quantization — not yet done + # ── Sensitivity: --dtype ───────────────────────────────────────────────── + # bfloat16 baseline is covered by the tp sweep entries above. 
+ - model: meta-llama/Llama-3.1-8B-Instruct + tp: 1 + dtype: float16 + kv_cache_dtype: auto + _sweep_dim: dtype + + - model: meta-llama/Llama-3.1-8B-Instruct + tp: 1 + dtype: float32 + kv_cache_dtype: auto + _sweep_dim: dtype + + # ── Sensitivity: --kv-cache-dtype ──────────────────────────────────────── + # kv_cache_dtype=auto baseline is covered by the tp sweep entries above. - model: meta-llama/Llama-3.1-8B-Instruct + tp: 1 + dtype: bfloat16 + kv_cache_dtype: fp8 + _sweep_dim: kv_cache_dtype + + - model: Qwen/Qwen2.5-7B-Instruct + tp: 1 + dtype: bfloat16 + kv_cache_dtype: fp8 + _sweep_dim: kv_cache_dtype + + # ── Sensitivity: runtime --quantization ────────────────────────────────── + # quantization=null baseline is covered by the tp sweep entries above. + - model: meta-llama/Llama-3.1-8B-Instruct # runtime fp8 inline quantization (not pre-quantized weights) tp: 1 quantization: fp8 _sweep_dim: quantization - # DONE: w8a8 - # - model: RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 - # tp: 1 - # dtype: float16 - # kv_cache_dtype: auto - # quantization: null - # _label: w8a8-redhatai-llama-3-1-8b - # _sweep_dim: quantization - - # DONE: w4a16 - # - model: RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 - # tp: 1 - # dtype: float16 - # kv_cache_dtype: auto - # quantization: null - # _label: w4a16-redhatai-llama-3-1-8b - # _sweep_dim: quantization - - # DONE: w8a8 Mistral (tp=1 and tp=2) - # - model: RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 - # tp: 1 - # _label: w8a8-mistral-small-24b - # _sweep_dim: quantization - - # DONE: fp8-dynamic (tp=2 and tp=4) - # - model: RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic - # tp: 2 # tp=1 OOM: ~65 GiB fp8 weights leave <5 GiB KV on single H100 - # _label: fp8dyn-llama-3-3-70b - # _sweep_dim: quantization - - - model: redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 - tp: 2 # tp=1 OOM for same reason - _sweep_dim: quantization + # Invalid: Qwen3-14B tp=5 rejected by vLLM — vocab_size=151936 not divisible by 
5. + # Valid tp for Qwen3-14B: divisors of gcd(num_attention_heads=40, vocab_size=151936)=8 → {1,2,4,8}. - # DONE: w8a8 Qwen2.5 7B - # - model: RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 - # tp: 1 - # _label: w8a8-redhatai-qwen2-5-7b - # _sweep_dim: quantization - - # DONE: fp8-dynamic Qwen2.5 7B - # - model: RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic - # tp: 1 - # _label: fp8dyn-redhatai-qwen2-5-7b - # _sweep_dim: quantization - - # ── vLLM version sensitivity: activation memory across releases ─────────── - # Goal: validate whether activation constants change across vLLM versions. - # Model: Qwen3-14B (tp=1, ~28 GiB bf16; tp=5 invalid — vocab not divisible). - # See sweep-versions.yaml to run these; results go in results/v/ - # DONE: v0.15.0 - # - model: Qwen/Qwen3-14B - # tp: 1 - # vllm_image: vllm/vllm-openai:v0.15.0 - # _label: qwen3-14b-vllm-v0.15.0 - # _sweep_dim: vllm_version - # DONE: v0.16.0 - # - model: Qwen/Qwen3-14B - # tp: 1 - # vllm_image: vllm/vllm-openai:v0.16.0 - # _label: qwen3-14b-vllm-v0.16.0 - # _sweep_dim: vllm_version - # DONE: v0.17.0 - # - model: Qwen/Qwen3-14B - # tp: 1 - # vllm_image: vllm/vllm-openai:v0.17.0 - # _label: qwen3-14b-vllm-v0.17.0 - # _sweep_dim: vllm_version - # DONE: v0.18.0 - # - model: Qwen/Qwen3-14B - # tp: 1 - # vllm_image: vllm/vllm-openai:v0.18.0 - # _label: qwen3-14b-vllm-v0.18.0 - # _sweep_dim: vllm_version + # ── Pre-quantized models (RedHat) ───────────────────────────────────────── + # Quantization is auto-detected from the model's quantization_config in the HF repo. 
+ - model: RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic # fp8-dynamic weights; compressed-tensors + tp: 2 + _label: fp8dyn-llama-3-3-70b-tp2 + + - model: RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic + tp: 4 + _label: fp8dyn-llama-3-3-70b-tp4 + + - model: redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 # w8a8 weights; compressed-tensors + tp: 2 + + - model: RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 # w4a16 weights; gptq_marlin + tp: 1 + dtype: float16 + _label: w4a16-redhatai-llama-3-1-8b + + - model: RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 # w8a8 weights; compressed-tensors + tp: 1 + dtype: float16 + _label: w8a8-redhatai-llama-3-1-8b + + - model: RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 # w8a8 weights; compressed-tensors + tp: [1, 2] + _sweep_dim: tp + + - model: RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic # fp8-dynamic weights; compressed-tensors + tp: 1 + _label: fp8dyn-redhatai-qwen2-5-7b + + - model: RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 # w8a8 weights; compressed-tensors + tp: 1 + _label: w8a8-redhatai-qwen2-5-7b From 2489dbbf3b3608082d2d7294760fffacf55f25a1 Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Fri, 24 Apr 2026 14:32:26 -0400 Subject: [PATCH 14/24] Remove logs Signed-off-by: Jing Chen --- accuracy/README.md | 12 +- ...4b---h100-80gb--tp2pp1dp1--8192.FAILED.log | 308 -- ...llama-7b-h--h100-80gb--tp1pp1dp1--8192.log | 764 --- ...epseek-v2---h100-80gb--tp1pp1dp1--8192.log | 787 ---- ...ma-3-3-70b--h100-80gb--tp2pp1dp1--8192.log | 2150 --------- ...ma-3-3-70b--h100-80gb--tp4pp1dp1--8192.log | 4086 ----------------- ...i-qwen2-5---h100-80gb--tp1pp1dp1--8192.log | 864 ---- ...-it--h100-80gb--tp1pp1dp1--8192.FAILED.log | 133 - ...a-2-27b-it--h100-80gb--tp1pp1dp1--8192.log | 770 ---- ...-it--h100-80gb--tp1pp1dp1--8192.FAILED.log | 133 - ...ma-2-2b-it--h100-80gb--tp1pp1dp1--8192.log | 745 --- ...-it--h100-80gb--tp1pp1dp1--8192.FAILED.log | 133 - ...ma-2-9b-it--h100-80gb--tp1pp1dp1--8192.log | 749 --- 
...-it--h100-80gb--tp1pp1dp1--8192.FAILED.log | 133 - ...a-3-12b-it--h100-80gb--tp1pp1dp1--8192.log | 769 ---- ...-it--h100-80gb--tp1pp1dp1--8192.FAILED.log | 133 - ...a-3-27b-it--h100-80gb--tp1pp1dp1--8192.log | 783 ---- ...-it--h100-80gb--tp1pp1dp1--8192.FAILED.log | 133 - ...ma-3-4b-it--h100-80gb--tp1pp1dp1--8192.log | 766 --- ...-it--h100-80gb--tp1pp1dp1--8192.FAILED.log | 77 - ...e-gemma-7b--h100-80gb--tp1pp1dp1--8192.log | 776 ---- ...ite-3-1-2b--h100-80gb--tp1pp1dp1--8192.log | 743 --- ...ite-3-1-8b--h100-80gb--tp1pp1dp1--8192.log | 746 --- ...anite-3-3---h100-80gb--tp1pp1dp1--8192.log | 746 --- ...anite-visi--h100-80gb--tp1pp1dp1--8192.log | 750 --- ...100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.log | 751 --- ...ma---h100-80gb--tp1pp1dp1--8192-dtbf16.log | 749 --- ...ma-3---h100-80gb--tp1pp1dp1--8192-qfp8.log | 1105 ----- ...ma-3--h100-80gb--tp1pp1dp1--8192-dtf16.log | 746 --- ...ma-3--h100-80gb--tp1pp1dp1--8192-dtf32.log | 750 --- ...ma-3-1-8b---h100-80gb--tp1pp1dp1--2048.log | 746 --- ...ma-3-1-8b---h100-80gb--tp1pp1dp1--4096.log | 747 --- ...ma-3-1-8b---h100-80gb--tp1pp1dp1--8192.log | 746 --- ...8b---h100-80gb--tp1pp1dp2--8192.FAILED.log | 370 -- ...ma-3-1-8b---h100-80gb--tp1pp2dp1--8192.log | 1398 ------ ...ma-3-1-8b---h100-80gb--tp1pp4dp1--8192.log | 2627 ----------- ...ma-3-1-8b---h100-80gb--tp2pp1dp1--8192.log | 1438 ------ ...ma-3-1-8b---h100-80gb--tp4pp1dp1--8192.log | 2768 ----------- ...ma-3-1-8b--h100-80gb--tp1pp1dp1--16384.log | 746 --- ...ma-3-1-8b--h100-80gb--tp1pp1dp1--32768.log | 747 --- ...out--h100-80gb--tp4pp1dp1--8192.FAILED.log | 440 -- ...ma-4-scout--h100-80gb--tp4pp1dp1--8192.log | 2342 ---------- ...soft-phi-2--h100-80gb--tp1pp1dp1--2048.log | 769 ---- ...i-2--h100-80gb--tp1pp1dp1--8192.FAILED.log | 80 - ...soft-phi-4--h100-80gb--tp1pp1dp1--8192.log | 752 --- ...ral-small---h100-80gb--tp1pp1dp1--8192.log | 919 ---- ...ral-8x7b-i--h100-80gb--tp2pp1dp1--8192.log | 1473 ------ ...mi-dev-72b--h100-80gb--tp2pp1dp1--8192.log | 1565 ------- 
...72b--h100-80gb--tp4pp1dp1--8192.FAILED.log | 399 -- ...mi-dev-72b--h100-80gb--tp4pp1dp1--8192.log | 2854 ------------ ...i-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.log | 1126 ----- ...i-vl-a3b-i--h100-80gb--tp2pp1dp1--8192.log | 2188 --------- ...20b--h100-80gb--tp1pp1dp1--8192.FAILED.log | 1112 ----- ...20b--h100-80gb--tp2pp1dp1--8192.FAILED.log | 2104 --------- ...pt-oss-20b--h100-80gb--tp2pp1dp1--8192.log | 1855 -------- ...-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.log | 771 ---- ...100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.log | 750 --- ...2b-instruc--h100-80gb--tp2pp1dp1--8192.log | 1523 ------ ...b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.log | 746 --- ...b-instruc--h100-80gb--tp1pp1dp1--16384.log | 747 --- ...b-instruc--h100-80gb--tp1pp1dp1--32768.log | 747 --- ...b-instruct--h100-80gb--tp1pp1dp1--2048.log | 746 --- ...b-instruct--h100-80gb--tp1pp1dp1--4096.log | 746 --- ...b-instruct--h100-80gb--tp1pp1dp1--8192.log | 749 --- ...b-instruct--h100-80gb--tp2pp1dp1--8192.log | 1437 ------ ...b-instruct--h100-80gb--tp4pp1dp1--8192.log | 2766 ----------- ...14b--h100-80gb--tp5pp1dp1--8192.FAILED.log | 675 --- ...n3-30b-a3b--h100-80gb--tp1pp1dp1--8192.log | 774 ---- ...n-qwen3-8b--h100-80gb--tp1pp1dp1--8192.log | 751 --- ...-3-3-70b-i--h100-80gb--tp2pp1dp1--8192.log | 2144 --------- ...-lla--h100-80gb--tp1pp1dp1--8192-dtf16.log | 745 --- ...-small-24b--h100-80gb--tp1pp1dp1--8192.log | 928 ---- ...-small-24b--h100-80gb--tp2pp1dp1--8192.log | 1847 -------- ...llam--h100-80gb--tp1pp1dp1--8192-dtf16.log | 878 ---- ...qwen2-5-7b--h100-80gb--tp1pp1dp1--8192.log | 861 ---- .../results/v0.19.0/results_predicted.csv | 73 - accuracy/results/v0.19.0/results_raw.csv | 75 - ...lama-7b-h--h100-80gb--tp1pp1dp1--8192.json | 25 - ...pseek-v2---h100-80gb--tp1pp1dp1--8192.json | 25 - ...a-3-3-70b--h100-80gb--tp2pp1dp1--8192.json | 26 - ...a-3-3-70b--h100-80gb--tp4pp1dp1--8192.json | 26 - ...-qwen2-5---h100-80gb--tp1pp1dp1--8192.json | 26 - ...-2-27b-it--h100-80gb--tp1pp1dp1--8192.json | 29 - 
...a-2-2b-it--h100-80gb--tp1pp1dp1--8192.json | 29 - ...a-2-9b-it--h100-80gb--tp1pp1dp1--8192.json | 29 - ...-3-12b-it--h100-80gb--tp1pp1dp1--8192.json | 29 - ...-3-27b-it--h100-80gb--tp1pp1dp1--8192.json | 29 - ...a-3-4b-it--h100-80gb--tp1pp1dp1--8192.json | 29 - ...-gemma-7b--h100-80gb--tp1pp1dp1--8192.json | 29 - ...te-3-1-2b--h100-80gb--tp1pp1dp1--8192.json | 25 - ...te-3-1-8b--h100-80gb--tp1pp1dp1--8192.json | 25 - ...nite-3-3---h100-80gb--tp1pp1dp1--8192.json | 25 - ...nite-visi--h100-80gb--tp1pp1dp1--8192.json | 25 - ...00-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.json | 26 - ...a---h100-80gb--tp1pp1dp1--8192-dtbf16.json | 26 - ...a-3---h100-80gb--tp1pp1dp1--8192-qfp8.json | 26 - ...a-3--h100-80gb--tp1pp1dp1--8192-dtf16.json | 26 - ...a-3--h100-80gb--tp1pp1dp1--8192-dtf32.json | 26 - ...a-3-1-8b---h100-80gb--tp1pp1dp1--2048.json | 26 - ...a-3-1-8b---h100-80gb--tp1pp1dp1--4096.json | 26 - ...a-3-1-8b---h100-80gb--tp1pp1dp1--8192.json | 26 - ...a-3-1-8b---h100-80gb--tp1pp2dp1--8192.json | 26 - ...a-3-1-8b---h100-80gb--tp1pp4dp1--8192.json | 26 - ...a-3-1-8b---h100-80gb--tp2pp1dp1--8192.json | 26 - ...a-3-1-8b---h100-80gb--tp4pp1dp1--8192.json | 26 - ...a-3-1-8b--h100-80gb--tp1pp1dp1--16384.json | 26 - ...a-3-1-8b--h100-80gb--tp1pp1dp1--32768.json | 26 - ...a-4-scout--h100-80gb--tp4pp1dp1--8192.json | 29 - ...oft-phi-2--h100-80gb--tp1pp1dp1--2048.json | 29 - ...oft-phi-4--h100-80gb--tp1pp1dp1--8192.json | 25 - ...al-small---h100-80gb--tp1pp1dp1--8192.json | 25 - ...al-8x7b-i--h100-80gb--tp2pp1dp1--8192.json | 25 - ...i-dev-72b--h100-80gb--tp2pp1dp1--8192.json | 25 - ...i-dev-72b--h100-80gb--tp4pp1dp1--8192.json | 26 - ...-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.json | 25 - ...-vl-a3b-i--h100-80gb--tp2pp1dp1--8192.json | 25 - ...t-oss-20b--h100-80gb--tp2pp1dp1--8192.json | 29 - ...moe-a2-7b--h100-80gb--tp1pp1dp1--8192.json | 29 - ...00-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.json | 26 - ...b-instruc--h100-80gb--tp2pp1dp1--8192.json | 25 - 
...-i--h100-80gb--tp1pp1dp1--8192-dtbf16.json | 26 - ...-instruc--h100-80gb--tp1pp1dp1--16384.json | 26 - ...-instruc--h100-80gb--tp1pp1dp1--32768.json | 26 - ...-instruct--h100-80gb--tp1pp1dp1--2048.json | 26 - ...-instruct--h100-80gb--tp1pp1dp1--4096.json | 26 - ...-instruct--h100-80gb--tp1pp1dp1--8192.json | 25 - ...-instruct--h100-80gb--tp2pp1dp1--8192.json | 26 - ...-instruct--h100-80gb--tp4pp1dp1--8192.json | 26 - ...3-30b-a3b--h100-80gb--tp1pp1dp1--8192.json | 25 - ...-qwen3-8b--h100-80gb--tp1pp1dp1--8192.json | 25 - ...3-3-70b-i--h100-80gb--tp2pp1dp1--8192.json | 26 - ...lla--h100-80gb--tp1pp1dp1--8192-dtf16.json | 26 - ...small-24b--h100-80gb--tp1pp1dp1--8192.json | 26 - ...small-24b--h100-80gb--tp2pp1dp1--8192.json | 26 - ...lam--h100-80gb--tp1pp1dp1--8192-dtf16.json | 26 - ...wen2-5-7b--h100-80gb--tp1pp1dp1--8192.json | 26 - ...m-v0-15-0--h100-80gb--tp1pp1dp1--8192.json | 29 - ...m-v0-16-0--h100-80gb--tp1pp1dp1--8192.json | 29 - ...m-v0-17-0--h100-80gb--tp1pp1dp1--8192.json | 29 - ...m-v0-18-0--h100-80gb--tp1pp1dp1--8192.json | 30 - 140 files changed, 8 insertions(+), 77740 deletions(-) delete mode 100644 accuracy/results/v0.19.0/logs/codellama-codellama-34b---h100-80gb--tp2pp1dp1--8192.FAILED.log delete mode 100644 accuracy/results/v0.19.0/logs/codellama-codellama-7b-h--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/deepseek-ai-deepseek-v2---h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/fp8dyn-llama-3-3-70b--h100-80gb--tp2pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/fp8dyn-llama-3-3-70b--h100-80gb--tp4pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/fp8dyn-redhatai-qwen2-5---h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log delete mode 100644 accuracy/results/v0.19.0/logs/google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 
accuracy/results/v0.19.0/logs/google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log delete mode 100644 accuracy/results/v0.19.0/logs/google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log delete mode 100644 accuracy/results/v0.19.0/logs/google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log delete mode 100644 accuracy/results/v0.19.0/logs/google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log delete mode 100644 accuracy/results/v0.19.0/logs/google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log delete mode 100644 accuracy/results/v0.19.0/logs/google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/google-gemma-4-e4b-it--h100-80gb--tp1pp1dp1--8192.FAILED.log delete mode 100644 accuracy/results/v0.19.0/logs/google-gemma-7b--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/granite-3-1-2b--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/granite-3-1-8b--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/ibm-granite-granite-3-3---h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/ibm-granite-granite-visi--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/meta-llama---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.log delete mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama---h100-80gb--tp1pp1dp1--8192-dtbf16.log delete mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3---h100-80gb--tp1pp1dp1--8192-qfp8.log delete mode 100644 
accuracy/results/v0.19.0/logs/meta-llama-llama-3--h100-80gb--tp1pp1dp1--8192-dtf16.log delete mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3--h100-80gb--tp1pp1dp1--8192-dtf32.log delete mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--2048.log delete mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--4096.log delete mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp2--8192.FAILED.log delete mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp2dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp4dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp2pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp4pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--16384.log delete mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--32768.log delete mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.FAILED.log delete mode 100644 accuracy/results/v0.19.0/logs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/microsoft-phi-2--h100-80gb--tp1pp1dp1--2048.log delete mode 100644 accuracy/results/v0.19.0/logs/microsoft-phi-2--h100-80gb--tp1pp1dp1--8192.FAILED.log delete mode 100644 accuracy/results/v0.19.0/logs/microsoft-phi-4--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/mistralai-mistral-small---h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/mistralai-mixtral-8x7b-i--h100-80gb--tp2pp1dp1--8192.log delete 
mode 100644 accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp2pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.FAILED.log delete mode 100644 accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp2pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp1pp1dp1--8192.FAILED.log delete mode 100644 accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.FAILED.log delete mode 100644 accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen1-5-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.log delete mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2-5-72b-instruc--h100-80gb--tp2pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.log delete mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruc--h100-80gb--tp1pp1dp1--16384.log delete mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruc--h100-80gb--tp1pp1dp1--32768.log delete mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--2048.log delete mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--4096.log delete mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp2pp1dp1--8192.log delete mode 100644 
accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp4pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen3-14b--h100-80gb--tp5pp1dp1--8192.FAILED.log delete mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen3-30b-a3b--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/qwen-qwen3-8b--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/redhatai-llama-3-3-70b-i--h100-80gb--tp2pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/w4a16-redhatai-lla--h100-80gb--tp1pp1dp1--8192-dtf16.log delete mode 100644 accuracy/results/v0.19.0/logs/w8a8-mistral-small-24b--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/w8a8-mistral-small-24b--h100-80gb--tp2pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/logs/w8a8-redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16.log delete mode 100644 accuracy/results/v0.19.0/logs/w8a8-redhatai-qwen2-5-7b--h100-80gb--tp1pp1dp1--8192.log delete mode 100644 accuracy/results/v0.19.0/results_predicted.csv delete mode 100644 accuracy/results/v0.19.0/results_raw.csv delete mode 100644 accuracy/results/v0.19.0/runs/codellama-codellama-7b-h--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/deepseek-ai-deepseek-v2---h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/fp8dyn-llama-3-3-70b--h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/fp8dyn-llama-3-3-70b--h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/fp8dyn-redhatai-qwen2-5---h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 
accuracy/results/v0.19.0/runs/google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/google-gemma-7b--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/granite-3-1-2b--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/granite-3-1-8b--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/ibm-granite-granite-3-3---h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/ibm-granite-granite-visi--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama---h100-80gb--tp1pp1dp1--8192-dtbf16.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3---h100-80gb--tp1pp1dp1--8192-qfp8.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3--h100-80gb--tp1pp1dp1--8192-dtf16.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3--h100-80gb--tp1pp1dp1--8192-dtf32.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--2048.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--4096.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp2dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp1pp4dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp2pp1dp1--8192.json delete mode 100644 
accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b---h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--16384.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--32768.json delete mode 100644 accuracy/results/v0.19.0/runs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/microsoft-phi-2--h100-80gb--tp1pp1dp1--2048.json delete mode 100644 accuracy/results/v0.19.0/runs/microsoft-phi-4--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/mistralai-mistral-small---h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/mistralai-mixtral-8x7b-i--h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/moonshotai-kimi-dev-72b--h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen1-5-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2---h100-80gb--tp1pp1dp1--8192-dtbf16-kvfp8.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-72b-instruc--h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruc--h100-80gb--tp1pp1dp1--16384.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruc--h100-80gb--tp1pp1dp1--32768.json delete mode 
100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--2048.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--4096.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen2-5-7b-instruct--h100-80gb--tp4pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen3-30b-a3b--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/qwen-qwen3-8b--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/redhatai-llama-3-3-70b-i--h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/w4a16-redhatai-lla--h100-80gb--tp1pp1dp1--8192-dtf16.json delete mode 100644 accuracy/results/v0.19.0/runs/w8a8-mistral-small-24b--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/w8a8-mistral-small-24b--h100-80gb--tp2pp1dp1--8192.json delete mode 100644 accuracy/results/v0.19.0/runs/w8a8-redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16.json delete mode 100644 accuracy/results/v0.19.0/runs/w8a8-redhatai-qwen2-5-7b--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-15-0--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-16-0--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-17-0--h100-80gb--tp1pp1dp1--8192.json delete mode 100644 accuracy/results/version-sweep/runs/qwen3-14b-vllm-v0-18-0--h100-80gb--tp1pp1dp1--8192.json diff --git a/accuracy/README.md b/accuracy/README.md index eba6f212..e6aaaf3c 100644 --- a/accuracy/README.md +++ b/accuracy/README.md @@ -117,7 +117,7 @@ python accuracy/scripts/collect.py # 
Results land in: data/benchmarks/memory/v0.19.0/runs/ and .../logs/ ``` -Copy the new run JSONs into `accuracy/results/v0.19.0/runs/`, then generate the report. +Copy the new run JSONs into `accuracy/results/v0.19.0/runs/` (this directory is gitignored), then generate the report. `analyze.py` calls the capacity planner directly to compute predictions — no separate calibration step needed. For gated models pass `--hf-token ` (only fetches `config.json`, not model weights): @@ -133,10 +133,14 @@ python accuracy/scripts/deep_analysis.py \ --out accuracy/results/v0.19.0/deep_analysis.md ``` -## Reproducing from committed results (no cluster needed) +## Reproducing from existing results (no cluster needed) -The raw run JSONs are committed in `accuracy/results/v0.19.0/runs/`. To regenerate -the report and analysis locally: +The raw run JSONs are not committed to git (logs and runs are large; see `.gitignore`). +Download the `results/` folder from Google Drive and place it at `accuracy/results/`: + +**[Download results/ from Google Drive](https://drive.google.com/drive/folders/1a0y2gdhcpKcFxm4RsqXUKWW40Gpd2Kx5?usp=sharing)** + +Once downloaded, regenerate the report and analysis locally: ```bash uv run python accuracy/scripts/analyze.py \ diff --git a/accuracy/results/v0.19.0/logs/codellama-codellama-34b---h100-80gb--tp2pp1dp1--8192.FAILED.log b/accuracy/results/v0.19.0/logs/codellama-codellama-34b---h100-80gb--tp2pp1dp1--8192.FAILED.log deleted file mode 100644 index c54e2f3c..00000000 --- a/accuracy/results/v0.19.0/logs/codellama-codellama-34b---h100-80gb--tp2pp1dp1--8192.FAILED.log +++ /dev/null @@ -1,308 +0,0 @@ -DEBUG 04-22 15:13:32 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 15:13:32 [platforms/__init__.py:37] Checking if TPU platform is available. 
-DEBUG 04-22 15:13:32 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 15:13:32 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:13:32 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:13:32 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 15:13:32 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 15:13:32 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 15:13:32 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 15:13:32 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:13:32 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:13:32 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 15:13:37 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 15:13:39 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 15:13:39 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 15:13:39 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 15:13:39 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 15:13:39 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 15:13:39 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:13:39 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 15:13:39 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 15:13:39 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model codellama/CodeLlama-34b-hf -(APIServer pid=1) INFO 04-22 15:13:39 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 15:13:39 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:13:39 [entrypoints/utils.py:233] non-default args: {'model_tag': 'codellama/CodeLlama-34b-hf', 'model': 'codellama/CodeLlama-34b-hf', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 15:13:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 15:13:39 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 15:13:39 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003513 secs -(APIServer pid=1) INFO 04-22 15:13:39 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 15:13:39 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 15:13:39 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 15:13:39 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 15:13:39 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 15:13:39 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 15:13:39 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 15:13:39 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 15:13:39 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 15:13:39 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) INFO 04-22 15:13:40 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 15:13:40 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 15:13:40 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 15:13:41 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 15:13:41 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 15:13:45 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 15:13:45 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 15:13:45 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 15:13:45 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:13:45 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:13:45 [platforms/__init__.py:113] Checking if ROCm platform is available. 
-DEBUG 04-22 15:13:45 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 15:13:45 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 15:13:45 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 15:13:45 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:13:45 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:13:45 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 15:13:50 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=285) DEBUG 04-22 15:13:51 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 15:13:51 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=285) DEBUG 04-22 15:13:51 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/6031e569-78d3-4367-a319-d00f5784bdee'], outputs=['ipc:///tmp/8009dedd-8f40-47ff-b55d-2493b94b720f'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=285) DEBUG 04-22 15:13:51 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=285) DEBUG 04-22 15:13:51 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=285) DEBUG 04-22 15:13:51 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=285) DEBUG 04-22 15:13:51 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=285) DEBUG 04-22 15:13:51 [plugins/__init__.py:49] All plugins in this group will be 
loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(EngineCore pid=285) INFO 04-22 15:13:51 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='codellama/CodeLlama-34b-hf', speculative_config=None, tokenizer='codellama/CodeLlama-34b-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=codellama/CodeLlama-34b-hf, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [4096, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=285) WARNING 04-22 15:13:51 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=285) INFO 04-22 15:13:51 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.128.10.77 (local), world_size=2, local_world_size=2 -(EngineCore pid=285) DEBUG 04-22 15:13:51 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/102b683d-6a30-49da-beae-627ed84291b1 -(EngineCore pid=285) DEBUG 04-22 15:13:51 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_18c33fdd'), local_subscribe_addr='ipc:///tmp/102b683d-6a30-49da-beae-627ed84291b1', local_notify_addr='ipc:///tmp/92e6e20f-9b40-4887-850b-cf71b0aa0473', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 15:13:55 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 15:13:55 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 15:13:55 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 15:13:55 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 15:13:55 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 15:13:55 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 15:13:55 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:13:55 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:13:55 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:13:55 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:13:55 [platforms/__init__.py:113] Checking if ROCm platform is available. 
-DEBUG 04-22 15:13:55 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 15:13:55 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 15:13:55 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 15:13:55 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:13:55 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 15:13:55 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 15:13:55 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 15:13:55 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 15:13:55 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:13:55 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:13:55 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 15:13:55 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:13:55 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 15:13:59 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 15:13:59 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-22 15:14:01 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 15:14:01 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 15:14:01 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 15:14:01 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 15:14:01 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 15:14:01 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 15:14:01 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 15:14:01 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(Worker pid=484) DEBUG 04-22 15:14:01 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:45259 backend=nccl -(Worker pid=484) INFO 04-22 15:14:01 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:45259 backend=nccl -(APIServer pid=1) DEBUG 04-22 15:14:01 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker pid=485) DEBUG 04-22 15:14:01 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:45259 backend=nccl -(Worker pid=485) INFO 04-22 15:14:01 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:45259 backend=nccl -[Gloo] Rank 0 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=485) DEBUG 04-22 15:14:01 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=484) DEBUG 04-22 15:14:01 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=485) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=485) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=484) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=484) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=484) DEBUG 04-22 15:14:02 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=484) INFO 04-22 15:14:02 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=485) DEBUG 04-22 15:14:02 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=484) DEBUG 04-22 15:14:02 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=484) DEBUG 04-22 15:14:02 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/6e43b6cb-dfa5-481f-9a41-9cfb8dbd05d1 -(Worker pid=484) DEBUG 04-22 15:14:02 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_832a28d6'), local_subscribe_addr='ipc:///tmp/6e43b6cb-dfa5-481f-9a41-9cfb8dbd05d1', local_notify_addr='ipc:///tmp/c880f7ba-c003-469c-b499-2c3408a7c327', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=485) DEBUG 04-22 15:14:02 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/6e43b6cb-dfa5-481f-9a41-9cfb8dbd05d1 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(Worker pid=484) INFO 04-22 15:14:02 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. 
-(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 605, in __init__ -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] self.worker.init_device() -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 312, in init_device -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] self.worker.init_device() # type: ignore -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ 
-(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 283, in init_device -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] self.requested_memory = request_memory(init_snapshot, self.cache_config) -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/utils.py", line 413, in request_memory -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] raise ValueError( -(Worker pid=485) ERROR 04-22 15:14:02 [v1/executor/multiproc_executor.py:857] ValueError: Free memory on device cuda:1 (60.87/79.19 GiB) on startup is less than desired GPU memory utilization (0.95, 75.23 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes. 
-(EngineCore pid=285) DEBUG 04-22 15:14:02 [v1/executor/multiproc_executor.py:419] Worker Termination: allow workers to gracefully shutdown -(Worker pid=484) DEBUG 04-22 15:14:03 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776870843.010323, auto_measure=True -(Worker pid=484) DEBUG 04-22 15:14:03 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=484) DEBUG 04-22 15:14:03 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=484) DEBUG 04-22 15:14:03 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=484) DEBUG 04-22 15:14:03 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=484) DEBUG 04-22 15:14:03 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=484) DEBUG 04-22 15:14:03 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=484) DEBUG 04-22 15:14:03 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=484) DEBUG 04-22 15:14:03 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_TP0 pid=484) INFO 04-22 15:14:03 [v1/worker/gpu_model_runner.py:4735] Starting to load model codellama/CodeLlama-34b-hf... 
-(Worker_TP0 pid=484) DEBUG 04-22 15:14:03 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_TP0 pid=484) INFO 04-22 15:14:03 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(Worker_TP0 pid=484) INFO 04-22 15:14:03 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP0 pid=484) DEBUG 04-22 15:14:03 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=484) DEBUG 04-22 15:14:03 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=484) DEBUG 04-22 15:14:03 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 97, 'silu_and_mul': 48, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=484) DEBUG 04-22 15:14:03 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP0 pid=484) DEBUG 04-22 15:14:04 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00007.safetensors', 'model-00007-of-00007.safetensors', 'model-00005-of-00007.safetensors', 'model-00004-of-00007.safetensors', 'model-00006-of-00007.safetensors', 'model-00003-of-00007.safetensors', 'model-00001-of-00007.safetensors']] -(EngineCore pid=285) DEBUG 04-22 15:14:06 [v1/executor/multiproc_executor.py:424] Worker Termination: workers still running sending SIGTERM -(EngineCore pid=285) DEBUG 04-22 15:14:10 [v1/executor/multiproc_executor.py:429] Worker Termination: resorting to SIGKILL to take down workers -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] EngineCore failed to start. -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] Traceback (most recent call last): -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] return func(*args, **kwargs) -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__ -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] super().__init__( -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File 
"/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 114, in __init__ -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] self.model_executor = executor_class(vllm_config) -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 101, in __init__ -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] super().__init__(vllm_config) -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] return func(*args, **kwargs) -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 103, in __init__ -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] self._init_executor() -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 190, in _init_executor -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] self.workers = WorkerProc.wait_for_ready(unready_workers) -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 736, in wait_for_ready -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] raise e from None -(EngineCore pid=285) ERROR 04-22 15:14:10 [v1/engine/core.py:1108] Exception: WorkerProc initialization failed due to an 
exception in a background process. See stack trace for root cause. -(EngineCore pid=285) Process EngineCore: -(EngineCore pid=285) Traceback (most recent call last): -(EngineCore pid=285) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap -(EngineCore pid=285) self.run() -(EngineCore pid=285) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run -(EngineCore pid=285) self._target(*self._args, **self._kwargs) -(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1112, in run_engine_core -(EngineCore pid=285) raise e -(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core -(EngineCore pid=285) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) -(EngineCore pid=285) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(EngineCore pid=285) return func(*args, **kwargs) -(EngineCore pid=285) ^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__ -(EngineCore pid=285) super().__init__( -(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 114, in __init__ -(EngineCore pid=285) self.model_executor = executor_class(vllm_config) -(EngineCore pid=285) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 101, in __init__ -(EngineCore pid=285) super().__init__(vllm_config) -(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(EngineCore pid=285) return func(*args, **kwargs) -(EngineCore pid=285) ^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", 
line 103, in __init__ -(EngineCore pid=285) self._init_executor() -(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 190, in _init_executor -(EngineCore pid=285) self.workers = WorkerProc.wait_for_ready(unready_workers) -(EngineCore pid=285) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=285) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 736, in wait_for_ready -(EngineCore pid=285) raise e from None -(EngineCore pid=285) Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause. -(EngineCore pid=285) DEBUG 04-22 15:14:11 [v1/executor/multiproc_executor.py:438] Triggering shutdown of workers -(APIServer pid=1) DEBUG 04-22 15:14:11 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in -(APIServer pid=1) sys.exit(main()) -(APIServer pid=1) ^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main -(APIServer pid=1) args.dispatch_function(args) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd -(APIServer pid=1) uvloop.run(run_server(args)) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run -(APIServer pid=1) return __asyncio.run( -(APIServer pid=1) ^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run -(APIServer pid=1) return runner.run(main) -(APIServer pid=1) ^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run -(APIServer pid=1) return self._loop.run_until_complete(task) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "uvloop/loop.pyx", line 1518, 
in uvloop.loop.Loop.run_until_complete -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper -(APIServer pid=1) return await main -(APIServer pid=1) ^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server -(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker -(APIServer pid=1) async with build_async_engine_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client -(APIServer pid=1) async with build_async_engine_client_from_engine_args( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args -(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config -(APIServer pid=1) return cls( -(APIServer pid=1) ^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 154, in __init__ -(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(APIServer pid=1) return func(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client -(APIServer pid=1) return AsyncMPClient(*client_args) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(APIServer pid=1) return func(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 887, in __init__ -(APIServer pid=1) super().__init__( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 535, in __init__ -(APIServer pid=1) with launch_core_engines( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ -(APIServer pid=1) next(self.gen) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines -(APIServer pid=1) wait_for_engine_startup( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup -(APIServer pid=1) raise RuntimeError( -(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} -/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown - warnings.warn('resource_tracker: There appear to be %d ' -/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown - warnings.warn('resource_tracker: There appear to be %d ' diff --git a/accuracy/results/v0.19.0/logs/codellama-codellama-7b-h--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/codellama-codellama-7b-h--h100-80gb--tp1pp1dp1--8192.log deleted file mode 100644 index c0dada63..00000000 --- a/accuracy/results/v0.19.0/logs/codellama-codellama-7b-h--h100-80gb--tp1pp1dp1--8192.log +++ /dev/null @@ -1,764 +0,0 @@ -DEBUG 04-22 15:11:52 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 15:11:52 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 15:11:52 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 15:11:52 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:11:53 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:11:53 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 15:11:53 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 15:11:53 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 15:11:53 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 15:11:53 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:11:53 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:11:53 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-22 15:11:58 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 15:12:00 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 15:12:00 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 15:12:00 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 15:12:00 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 15:12:00 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 15:12:00 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:12:00 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 15:12:00 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 15:12:00 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model codellama/CodeLlama-7b-hf -(APIServer pid=1) INFO 04-22 15:12:00 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 15:12:00 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:12:00 [entrypoints/utils.py:233] non-default args: {'model_tag': 'codellama/CodeLlama-7b-hf', 'model': 'codellama/CodeLlama-7b-hf', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 15:12:00 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 15:12:11 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 15:12:11 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0034601 secs -(APIServer pid=1) 
INFO 04-22 15:12:11 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 15:12:11 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 15:12:11 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 15:12:11 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 15:12:11 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 15:12:11 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 15:12:11 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 15:12:11 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 15:12:11 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 15:12:12 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 15:12:12 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 15:12:13 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 15:12:13 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 15:12:17 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 15:12:17 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 15:12:17 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 15:12:17 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:12:17 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
-DEBUG 04-22 15:12:17 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 15:12:17 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 15:12:17 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 15:12:17 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 15:12:17 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:12:17 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:12:17 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 15:12:21 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=283) DEBUG 04-22 15:12:23 [v1/engine/core.py:1018] Waiting for init message from front-end. -(EngineCore pid=283) DEBUG 04-22 15:12:23 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/c0d6876b-7742-4367-b174-65491d8d5016'], outputs=['ipc:///tmp/1fb926e6-0134-40d0-9a6b-b4cc9cc044c7'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(APIServer pid=1) DEBUG 04-22 15:12:23 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=283) DEBUG 04-22 15:12:23 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=283) DEBUG 04-22 15:12:23 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=283) DEBUG 04-22 15:12:23 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=283) DEBUG 04-22 15:12:23 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=283) DEBUG 04-22 15:12:23 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(EngineCore pid=283) INFO 04-22 15:12:23 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='codellama/CodeLlama-7b-hf', speculative_config=None, tokenizer='codellama/CodeLlama-7b-hf', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, 
served_model_name=codellama/CodeLlama-7b-hf, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 
'static_all_moe_layers': []} -(EngineCore pid=283) DEBUG 04-22 15:12:23 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.75:52901 backend=nccl -(EngineCore pid=283) INFO 04-22 15:12:23 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.75:52901 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=283) DEBUG 04-22 15:12:23 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(EngineCore pid=283) INFO 04-22 15:12:23 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=283) DEBUG 04-22 15:12:24 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776870744.2147915, auto_measure=True -(EngineCore pid=283) DEBUG 04-22 15:12:24 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=283) DEBUG 04-22 15:12:24 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=283) DEBUG 04-22 15:12:24 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=283) DEBUG 04-22 15:12:24 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=283) DEBUG 04-22 15:12:24 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=283) DEBUG 04-22 15:12:24 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(EngineCore pid=283) DEBUG 04-22 15:12:24 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=283) DEBUG 04-22 15:12:24 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). 
-(EngineCore pid=283) INFO 04-22 15:12:24 [v1/worker/gpu_model_runner.py:4735] Starting to load model codellama/CodeLlama-7b-hf... -(EngineCore pid=283) DEBUG 04-22 15:12:25 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=283) INFO 04-22 15:12:25 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=283) INFO 04-22 15:12:25 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=283) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=283) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=283) DEBUG 04-22 15:12:25 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=283) DEBUG 04-22 15:12:25 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=283) DEBUG 04-22 15:12:25 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=283) DEBUG 04-22 15:12:25 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(EngineCore pid=283) DEBUG 04-22 15:12:25 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00002.safetensors', 'model-00001-of-00002.safetensors']] -(APIServer pid=1) DEBUG 04-22 15:12:33 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 15:12:43 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=283) INFO 04-22 15:12:51 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for codellama/CodeLlama-7b-hf: 25.845622 seconds -(EngineCore pid=283) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=283) INFO 04-22 15:13:07 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/b9a1c77cfa/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=283) 
DEBUG 04-22 15:13:07 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=b999767673 comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/b9a1c77cfa/rank_0_0/backbone -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore 
pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=283) DEBUG 04-22 
15:13:07 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 
'VLLM_GC_DEBUG': '', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 
'VLLM_MLA_DISABLE': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 
'localhost', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 
'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=283) DEBUG 04-22 
15:13:07 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 
[compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore 
pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/backends.py:1074] Vllm config hash: b999767673 -(EngineCore pid=283) INFO 04-22 15:13:07 [compilation/backends.py:1111] Dynamo bytecode transform 
time: 4.37 s -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=283) DEBUG 04-22 15:13:07 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=283) DEBUG 04-22 15:13:08 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=283) DEBUG 04-22 15:13:08 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=283) DEBUG 04-22 15:13:08 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms -(EngineCore pid=283) DEBUG 04-22 15:13:08 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=283) DEBUG 04-22 15:13:08 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=283) INFO 04-22 15:13:09 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=283) DEBUG 04-22 15:13:09 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/b9a1c77cfa/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=283) DEBUG 04-22 15:13:09 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=283) DEBUG 04-22 15:13:09 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=283) DEBUG 04-22 15:13:09 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=283) DEBUG 04-22 15:13:09 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=283) DEBUG 04-22 15:13:09 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass 
completed in 0.2 ms -(EngineCore pid=283) DEBUG 04-22 15:13:10 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/b9a1c77cfa/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=283) DEBUG 04-22 15:13:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=283) DEBUG 04-22 15:13:11 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms -(EngineCore pid=283) DEBUG 04-22 15:13:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms -(EngineCore pid=283) DEBUG 04-22 15:13:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=283) DEBUG 04-22 15:13:11 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(EngineCore pid=283) DEBUG 04-22 15:13:12 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/b9a1c77cfa/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') -(EngineCore pid=283) INFO 04-22 15:13:12 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 4.21 s -(EngineCore pid=283) DEBUG 04-22 15:13:12 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/b9a1c77cfa/rank_0_0/backbone/computation_graph.py -(APIServer pid=1) DEBUG 04-22 15:13:13 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=283) INFO 04-22 15:13:13 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/07d551a6df0697f257924722aaa2124b2a7dd3a0ce7f90b96a420893bbcca842/rank_0_0/model -(EngineCore pid=283) INFO 04-22 15:13:13 [compilation/monitor.py:48] torch.compile took 10.16 s in total -(EngineCore pid=283) INFO 04-22 15:13:13 [compilation/monitor.py:76] Initial profiling/warmup run took 0.45 s -(EngineCore pid=283) INFO 04-22 15:13:19 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=283) INFO 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=283) DEBUG 04-22 15:13:19 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=283) DEBUG 04-22 15:13:19 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 122.00 MiB first-capture + (51-1) × 6.00 MiB per-graph 
-(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=283) DEBUG 04-22 15:13:19 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=283) DEBUG 04-22 15:13:19 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=283) DEBUG 04-22 15:13:19 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=283) DEBUG 04-22 15:13:20 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=283) INFO 04-22 15:13:20 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total -(EngineCore pid=283) DEBUG 04-22 15:13:20 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=283) DEBUG 04-22 15:13:20 [v1/worker/gpu_worker.py:430] Free memory after profiling: 65.49 GiB (total), 62.05 GiB (within requested) -(EngineCore pid=283) DEBUG 04-22 15:13:20 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.55 seconds. Total non KV cache memory: 13.57GiB; torch peak memory increase: 0.77GiB; non-torch forward increase memory: 0.25GiB; weights memory: 12.56GiB. 
-(EngineCore pid=283) INFO 04-22 15:13:20 [v1/worker/gpu_worker.py:436] Available KV cache memory: 61.66 GiB -(EngineCore pid=283) INFO 04-22 15:13:20 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. -(EngineCore pid=283) INFO 04-22 15:13:20 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 126,256 tokens -(EngineCore pid=283) INFO 04-22 15:13:20 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 15.41x -(EngineCore pid=283) 2026-04-22 15:13:20,851 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=283) DEBUG 04-22 15:13:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=283) 2026-04-22 15:13:20,863 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=283) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:46:03 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:46:03 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:46:03 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:46:03 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:46:03 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:46:03 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model deepseek-ai/DeepSeek-V2-Lite-Chat -(APIServer pid=1) INFO 04-22 00:46:03 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:46:03 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:46:03 [entrypoints/utils.py:233] non-default args: {'model_tag': 'deepseek-ai/DeepSeek-V2-Lite-Chat', 'model': 'deepseek-ai/DeepSeek-V2-Lite-Chat', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:46:03 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) `rope_scaling`'s factor field must be a float >= 1, got 40 -(APIServer pid=1) `rope_scaling`'s beta_fast field must be a float, got 32 -(APIServer pid=1) `rope_scaling`'s beta_slow field must be a float, got 1 -(APIServer pid=1) DEBUG 04-22 00:46:04 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.deepseek_v2.DeepseekV2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:46:04 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0030133 secs -(APIServer pid=1) INFO 04-22 00:46:04 [config/model.py:549] Resolved architecture: DeepseekV2ForCausalLM -(APIServer pid=1) INFO 04-22 00:46:04 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:46:04 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:46:04 [config/model.py:1801] Generative models support prefix caching. 
-(APIServer pid=1) DEBUG 04-22 00:46:04 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:46:04 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:46:04 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 00:46:04 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:46:04 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:46:04 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:46:04 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:46:04 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:46:04 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:46:08 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:46:08 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:46:08 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:46:08 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:46:08 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:46:08 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:46:08 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:46:08 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:46:08 [platforms/__init__.py:165] Checking if CPU platform is available. 
-DEBUG 04-22 00:46:08 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:46:08 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:46:08 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:46:13 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=243) DEBUG 04-22 00:46:14 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:46:14 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=243) DEBUG 04-22 00:46:14 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/fed5a9fd-db47-4db7-9d5b-3619315b9d45'], outputs=['ipc:///tmp/61bf74bf-8446-40f3-9c7e-768ec2e40381'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 00:46:14 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 00:46:14 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 00:46:14 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 00:46:14 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 00:46:14 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=243) INFO 04-22 00:46:14 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='deepseek-ai/DeepSeek-V2-Lite-Chat', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-V2-Lite-Chat', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=deepseek-ai/DeepSeek-V2-Lite-Chat, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) DEBUG 04-22 00:46:15 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.30:43211 backend=nccl -(EngineCore pid=243) INFO 04-22 00:46:15 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.30:43211 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) DEBUG 04-22 00:46:15 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) INFO 04-22 00:46:15 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A -(EngineCore pid=243) DEBUG 04-22 00:46:15 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776818775.8321583, auto_measure=True -(EngineCore pid=243) DEBUG 04-22 00:46:15 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=243) DEBUG 04-22 00:46:15 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:46:15 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:46:15 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:46:15 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=243) DEBUG 04-22 00:46:15 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. 
Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(EngineCore pid=243) DEBUG 04-22 00:46:15 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=243) DEBUG 04-22 00:46:16 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=243) INFO 04-22 00:46:16 [v1/worker/gpu_model_runner.py:4735] Starting to load model deepseek-ai/DeepSeek-V2-Lite-Chat... -(EngineCore pid=243) DEBUG 04-22 00:46:16 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=576, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=True, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASHINFER_MLA: [compute capability not supported], FLASHMLA_SPARSE: [non-sparse not supported]}. -(EngineCore pid=243) INFO 04-22 00:46:16 [platforms/cuda.py:334] Using FLASH_ATTN_MLA attention backend out of potential backends: ['FLASH_ATTN_MLA', 'FLASHMLA', 'TRITON_MLA']. -(EngineCore pid=243) INFO 04-22 00:46:16 [model_executor/.../attention/mla_attention.py:2137] Using FlashAttention prefill for MLA -(EngineCore pid=243) INFO 04-22 00:46:16 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. -(EngineCore pid=243) INFO 04-22 00:46:16 [model_executor/.../oracle/unquantized.py:186] Using TRITON backend for Unquantized MoE -(EngineCore pid=243) DEBUG 04-22 00:46:16 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts -(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
-(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=243) DEBUG 04-22 00:46:16 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=243) DEBUG 04-22 00:46:16 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=243) DEBUG 04-22 00:46:16 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 82, 'mla_decode_concat_quant_fp8': 27, 'silu_and_mul': 27, 'fused_moe': 26, 'unquantized_fused_moe': 26, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=243) DEBUG 04-22 00:46:16 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=243) DEBUG 04-22 00:46:17 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00003-of-000004.safetensors', 'model-00002-of-000004.safetensors', 'model-00001-of-000004.safetensors', 'model-00004-of-000004.safetensors']] -(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/mla_attention.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/mla.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/deepseek_scaling_rope.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v2.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=243) INFO 04-22 00:46:49 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/3585710925/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1063] torch.compile cache factors: 
env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=6e053dcd55 comp=e546579c48 code=3dff38c17fb0ff7fac2589dc3dae8f8ea483056a47cdbf285df4c2af5c769b39 dir=/data/.cache/vllm/torch_compile_cache/3585710925/rank_0_0/backbone -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore 
pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, 
-(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=243) 
DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=243) DEBUG 04-22 00:46:49 
[compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=243) DEBUG 04-22 
00:46:49 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=243) DEBUG 04-22 00:46:49 
[compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=243) DEBUG 04-22 
00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore 
pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=243) 
DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 
'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/backends.py:1074] Vllm config hash: 6e053dcd55 -(EngineCore pid=243) INFO 04-22 00:46:49 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.97 s -(EngineCore pid=243) DEBUG 04-22 00:46:49 
[compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=243) DEBUG 04-22 00:46:49 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=243) DEBUG 04-22 00:46:50 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:46:50 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=243) DEBUG 04-22 00:46:50 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=243) DEBUG 04-22 00:46:50 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:46:50 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) INFO 04-22 00:46:52 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=243) DEBUG 04-22 00:46:52 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/3585710925/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=243) DEBUG 04-22 00:46:52 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:46:52 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(EngineCore pid=243) DEBUG 04-22 00:46:52 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.3 ms -(EngineCore pid=243) DEBUG 04-22 00:46:52 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:46:52 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) DEBUG 04-22 
00:46:53 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/3585710925/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=243) DEBUG 04-22 00:46:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:46:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=243) DEBUG 04-22 00:46:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=243) DEBUG 04-22 00:46:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:46:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(APIServer pid=1) DEBUG 04-22 00:46:54 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) DEBUG 04-22 00:46:55 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/3585710925/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_2') -(EngineCore pid=243) DEBUG 04-22 00:46:56 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:46:56 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(EngineCore pid=243) DEBUG 04-22 00:46:56 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms -(EngineCore pid=243) DEBUG 04-22 00:46:56 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:46:56 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 00:46:56 [compilation/backends.py:377] Store the 27-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_27', '/data/.cache/vllm/torch_compile_cache/3585710925/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_27') -(EngineCore pid=243) INFO 04-22 00:46:56 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 7.34 s -(EngineCore pid=243) DEBUG 04-22 00:46:56 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/3585710925/rank_0_0/backbone/computation_graph.py -(EngineCore pid=243) INFO 04-22 00:46:57 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/48b804c1578e9a38cf099387383e564c087328bc0200b1c8bc2bdfac409a174b/rank_0_0/model -(EngineCore pid=243) INFO 04-22 00:46:57 [compilation/monitor.py:48] torch.compile took 12.33 s in total -(EngineCore pid=243) 
/usr/local/lib/python3.12/dist-packages/torch/_inductor/lowering.py:7627: UserWarning: -(EngineCore pid=243) Online softmax is disabled on the fly since Inductor decides to -(EngineCore pid=243) split the reduction. Cut an issue to PyTorch if this is an -(EngineCore pid=243) important use case and you want to speed it up with online -(EngineCore pid=243) softmax. -(EngineCore pid=243) -(EngineCore pid=243) warnings.warn( -(EngineCore pid=243) WARNING 04-22 00:46:59 [model_executor/.../fused_moe/fused_moe.py:1090] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_H100_80GB_HBM3.json -(EngineCore pid=243) INFO 04-22 00:47:00 [compilation/monitor.py:76] Initial profiling/warmup run took 3.11 s -(APIServer pid=1) DEBUG 04-22 00:47:04 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=243) INFO 04-22 00:47:06 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=243) INFO 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:47:06 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None 
-(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:47:06 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 106.00 MiB first-capture + (51-1) × 10.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:47:06 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:47:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:47:06 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:47:07 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 522.00 MiB first-capture + (51-1) × 8.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 00:47:07 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=243) INFO 04-22 00:47:07 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.39 GiB total -(EngineCore pid=243) DEBUG 04-22 00:47:07 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB 
-(EngineCore pid=243) DEBUG 04-22 00:47:07 [v1/worker/gpu_worker.py:430] Free memory after profiling: 48.34 GiB (total), 44.89 GiB (within requested) -(EngineCore pid=243) DEBUG 04-22 00:47:07 [v1/worker/gpu_worker.py:435] Memory profiling takes 22.62 seconds. Total non KV cache memory: 31.62GiB; torch peak memory increase: 1.93GiB; non-torch forward increase memory: 0.26GiB; weights memory: 29.43GiB. -(EngineCore pid=243) INFO 04-22 00:47:07 [v1/worker/gpu_worker.py:436] Available KV cache memory: 43.61 GiB -(EngineCore pid=243) INFO 04-22 00:47:07 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9675 to maintain the same effective KV cache size. -(EngineCore pid=243) INFO 04-22 00:47:07 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,505,552 tokens -(EngineCore pid=243) INFO 04-22 00:47:07 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 183.78x -(EngineCore pid=243) 2026-04-22 00:47:07,885 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=243) DEBUG 04-22 00:47:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) 2026-04-22 00:47:07,915 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00= 1, got 40 -(APIServer pid=1) `rope_scaling`'s beta_fast field must be a float, got 32 -(APIServer pid=1) `rope_scaling`'s beta_slow field must be a float, got 1 -(APIServer pid=1) INFO 04-22 00:47:21 [renderers/hf.py:314] Detected the chat template content format to be 'string'. 
You can set `--chat-template-content-format` to override this. -(APIServer pid=1) DEBUG 04-22 00:47:21 [renderers/base.py:203] Chat template warmup completed in 0.695s -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:37] Available routes are: -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /load, Methods: GET -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /version, Methods: GET -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /health, Methods: GET -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /ping, Methods: GET -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /ping, Methods: POST -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: 
/v1/chat/completions/batch, Methods: POST -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST -(APIServer pid=1) INFO 04-22 00:47:22 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST -(APIServer pid=1) INFO: Started server process [1] -(APIServer pid=1) INFO: Waiting for application startup. -(APIServer pid=1) INFO: Application startup complete. -(APIServer pid=1) DEBUG 04-22 00:47:22 [v1/engine/async_llm.py:875] Called check_health. 
-(APIServer pid=1) INFO: 10.129.6.2:33478 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/fp8dyn-llama-3-3-70b--h100-80gb--tp2pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/fp8dyn-llama-3-3-70b--h100-80gb--tp2pp1dp1--8192.log deleted file mode 100644 index aa082fdb..00000000 --- a/accuracy/results/v0.19.0/logs/fp8dyn-llama-3-3-70b--h100-80gb--tp2pp1dp1--8192.log +++ /dev/null @@ -1,2150 +0,0 @@ -DEBUG 04-22 00:16:05 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:16:05 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:16:05 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:16:05 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:16:05 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:16:05 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:16:05 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:16:05 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:16:05 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:16:05 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:16:05 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:16:05 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:16:10 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-22 00:16:12 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 00:16:12 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:16:12 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:16:12 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:16:12 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 00:16:12 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:16:12 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:16:12 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:16:12 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic -(APIServer pid=1) INFO 04-22 00:16:12 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:16:12 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:16:12 [entrypoints/utils.py:233] non-default args: {'model_tag': 'RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic', 'model': 'RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:16:12 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:16:12 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:16:12 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003880 secs -(APIServer pid=1) INFO 04-22 00:16:12 [config/model.py:549] Resolved architecture: LlamaForCausalLM 
-(APIServer pid=1) INFO 04-22 00:16:12 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:16:13 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:16:13 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:16:13 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:16:13 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:16:13 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:16:13 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 00:16:13 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:16:13 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) INFO 04-22 00:16:14 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 00:16:14 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. 
-(APIServer pid=1) DEBUG 04-22 00:16:14 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:16:14 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:16:14 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:16:18 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:16:18 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:16:18 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:16:18 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:16:18 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:16:18 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:16:18 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:16:18 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:16:18 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:16:18 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:16:18 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:16:18 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:16:23 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(APIServer pid=1) DEBUG 04-22 00:16:24 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. -(EngineCore pid=245) DEBUG 04-22 00:16:25 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:16:25 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=245) DEBUG 04-22 00:16:25 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/8fd259a3-1a8f-44e1-b97e-568878a2a7bb'], outputs=['ipc:///tmp/95ad2f05-64c8-4329-b07c-dda78cdfb113'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=245) DEBUG 04-22 00:16:25 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=245) DEBUG 04-22 00:16:25 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=245) DEBUG 04-22 00:16:25 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=245) DEBUG 04-22 00:16:25 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=245) DEBUG 04-22 00:16:25 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=245) INFO 04-22 00:16:25 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic', speculative_config=None, tokenizer='RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [4096, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=245) WARNING 04-22 00:16:25 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=245) INFO 04-22 00:16:25 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.128.4.190 (local), world_size=2, local_world_size=2 -(EngineCore pid=245) DEBUG 04-22 00:16:25 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/586b3818-4b38-4ebe-9921-e3c3607e8f9d -(EngineCore pid=245) DEBUG 04-22 00:16:25 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_63d3722a'), local_subscribe_addr='ipc:///tmp/586b3818-4b38-4ebe-9921-e3c3607e8f9d', local_notify_addr='ipc:///tmp/1c30918c-7bce-4425-b471-562d86f610dd', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 00:16:28 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:16:28 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:16:28 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:16:28 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:16:28 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:16:28 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:16:28 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:16:28 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:16:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:16:28 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:16:28 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:16:28 [platforms/__init__.py:134] Checking if XPU platform is available. 
-DEBUG 04-22 00:16:28 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:16:28 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:16:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:16:28 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:16:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:16:28 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:16:28 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:16:28 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:16:28 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:16:28 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:16:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:16:28 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:16:33 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:16:33 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:16:34 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:16:34 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:16:34 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:16:34 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-DEBUG 04-22 00:16:34 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:16:34 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:16:34 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:16:34 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) DEBUG 04-22 00:16:35 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker pid=444) DEBUG 04-22 00:16:35 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:36569 backend=nccl -(Worker pid=444) INFO 04-22 00:16:35 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:36569 backend=nccl -(Worker pid=445) DEBUG 04-22 00:16:35 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:36569 backend=nccl -(Worker pid=445) INFO 04-22 00:16:35 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:36569 backend=nccl -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=445) DEBUG 04-22 00:16:35 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=444) DEBUG 04-22 00:16:35 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 1 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 -(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=444) DEBUG 04-22 00:16:35 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=444) INFO 04-22 00:16:35 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=445) DEBUG 04-22 00:16:36 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=444) DEBUG 04-22 00:16:36 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=444) DEBUG 04-22 00:16:36 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/2abe8c31-ff70-4e4d-90a9-e0ab7800b406 -(Worker pid=444) DEBUG 04-22 00:16:36 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_ff754c24'), local_subscribe_addr='ipc:///tmp/2abe8c31-ff70-4e4d-90a9-e0ab7800b406', local_notify_addr='ipc:///tmp/efd5d6a7-a676-41f7-a221-0335ab138965', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=445) DEBUG 04-22 00:16:36 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2abe8c31-ff70-4e4d-90a9-e0ab7800b406 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(Worker pid=444) INFO 04-22 00:16:36 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(Worker pid=445) DEBUG 04-22 00:16:36 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776816996.768284, auto_measure=True -(Worker pid=445) DEBUG 04-22 00:16:36 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=444) DEBUG 04-22 00:16:36 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776816996.8347344, auto_measure=True -(Worker pid=444) DEBUG 04-22 00:16:36 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=445) DEBUG 04-22 00:16:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=445) DEBUG 04-22 00:16:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=445) DEBUG 04-22 00:16:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=445) DEBUG 04-22 00:16:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=445) DEBUG 04-22 00:16:36 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). 
-(Worker pid=444) DEBUG 04-22 00:16:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 00:16:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=444) DEBUG 04-22 00:16:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 00:16:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=444) DEBUG 04-22 00:16:36 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=444) DEBUG 04-22 00:16:36 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=444) DEBUG 04-22 00:16:37 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_TP0 pid=444) INFO 04-22 00:16:37 [v1/worker/gpu_model_runner.py:4735] Starting to load model RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic... 
-(Worker_TP0 pid=444) INFO 04-22 00:16:37 [model_executor/.../linear/__init__.py:261] Selected CutlassFP8ScaledMMLinearKernel for CompressedTensorsW8A8Fp8 -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.o_proj -(Worker_TP0 pid=444) INFO 04-22 00:16:37 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_TP0 pid=444) INFO 04-22 00:16:37 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(Worker_TP0 pid=444) INFO 04-22 00:16:37 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.gate_up_proj -(Worker_TP0 pid=444) 
DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.o_proj 
-(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.5.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.qkv_proj -(Worker_TP1 
pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.11.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.o_proj 
-(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.18.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.down_proj -(Worker_TP0 
pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.22.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.qkv_proj 
-(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.29.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.gate_up_proj 
-(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.32.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.down_proj -(Worker_TP1 
pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.40.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.o_proj 
-(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.43.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.gate_up_proj 
-(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.51.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:37 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:37 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.qkv_proj -(Worker_TP0 
pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.54.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.o_proj 
-(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.61.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.down_proj -(Worker_TP1 
pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.67.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.qkv_proj 
-(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.70.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.gate_up_proj 
-(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.77.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.qkv_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.o_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.gate_up_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.down_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.qkv_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.o_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.gate_up_proj -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.down_proj -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [config/compilation.py:1195] disabled custom ops: Counter({'quant_fp8': 320, 'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [config/compilation.py:1195] disabled custom ops: Counter({'quant_fp8': 320, 'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP1 pid=445) DEBUG 04-22 00:16:38 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00013-of-00015.safetensors', 'model-00004-of-00015.safetensors', 'model-00006-of-00015.safetensors', 'model-00005-of-00015.safetensors', 'model-00009-of-00015.safetensors', 'model-00008-of-00015.safetensors', 'model-00014-of-00015.safetensors', 'model-00007-of-00015.safetensors', 'model-00012-of-00015.safetensors', 'model-00010-of-00015.safetensors', 'model-00011-of-00015.safetensors', 'model-00015-of-00015.safetensors', 'model-00002-of-00015.safetensors', 'model-00001-of-00015.safetensors', 'model-00003-of-00015.safetensors']] -(Worker_TP0 pid=444) DEBUG 04-22 00:16:38 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00014-of-00015.safetensors', 'model-00011-of-00015.safetensors', 'model-00001-of-00015.safetensors', 'model-00012-of-00015.safetensors', 'model-00009-of-00015.safetensors', 'model-00004-of-00015.safetensors', 'model-00013-of-00015.safetensors', 'model-00003-of-00015.safetensors', 'model-00008-of-00015.safetensors', 'model-00010-of-00015.safetensors', 'model-00007-of-00015.safetensors', 'model-00015-of-00015.safetensors', 'model-00005-of-00015.safetensors', 'model-00006-of-00015.safetensors', 'model-00002-of-00015.safetensors']] -(Worker_TP0 pid=444) Loading safetensors checkpoint shards: 0% Completed | 0/15 [00:00 -(Worker_TP0 pid=444) DEBUG 04-22 00:17:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 00:17:32 [compilation/decorators.py:528] Start compiling function -(EngineCore pid=245) DEBUG 04-22 00:17:33 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) DEBUG 04-22 00:17:35 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(APIServer pid=1) DEBUG 04-22 00:17:45 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=53000d16bd comp=e546579c48 code=f156083917c384d07960be9da52b3831ca85a8c6cca0b42b018e2920d91ddd8b dir=/data/.cache/vllm/torch_compile_cache/815459ed8e/rank_1_0/backbone -(Worker_TP0 pid=444) INFO 04-22 00:17:49 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone for vLLM's torch.compile -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, 
-(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 
[compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 
[compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', 
-(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, 
-(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, 
-(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP1 pid=445) 
DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=53000d16bd comp=e546579c48 code=f156083917c384d07960be9da52b3831ca85a8c6cca0b42b018e2920d91ddd8b dir=/data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP1 pid=445) DEBUG 04-22 
00:17:49 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP1 
pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': 
'/data/.cache/vllm/xla_cache', -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] Vllm config hash: 53000d16bd -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, 
-(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, 
-(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 
'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 
'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 
'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 
'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP0 
pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, 
-(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 
'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 
[compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/backends.py:1074] Vllm config hash: 53000d16bd -(Worker_TP0 pid=444) INFO 04-22 00:17:49 [compilation/backends.py:1111] Dynamo bytecode transform time: 16.88 s -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 4096 -(Worker_TP0 pid=444) INFO 04-22 00:17:49 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm -(Worker_TP0 pid=444) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. -(Worker_TP0 pid=444) return func(*args, **kwargs) -(Worker_TP0 pid=444) DEBUG 04-22 00:17:49 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=4096, hidden_dim=8192, dtype=torch.bfloat16 -(Worker_TP1 pid=445) DEBUG 04-22 00:17:49 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=4096, hidden_dim=8192, dtype=torch.bfloat16 -(Worker_TP0 pid=444) INFO 04-22 00:17:49 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm -(Worker_TP0 pid=444) DEBUG 04-22 00:17:50 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 4096), (4097, 8192)] -(Worker_TP0 pid=444) DEBUG 04-22 00:17:50 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(Worker_TP0 pid=444) DEBUG 04-22 00:17:51 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 
04-22 00:17:51 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:17:51 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 00:17:51 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:17:51 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP0 pid=444) DEBUG 04-22 00:17:51 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 24.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:17:51 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:17:51 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP0 pid=444) DEBUG 04-22 00:17:51 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:17:51 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP1 pid=445) DEBUG 04-22 00:17:51 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 24.2 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:17:51 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:17:51 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP1 pid=445) DEBUG 04-22 00:17:51 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=444) INFO 04-22 00:17:54 [compilation/backends.py:372] Cache the graph of compile range (1, 4096) for later use -(Worker_TP0 pid=444) DEBUG 04-22 00:17:54 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_0', 
'/data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_0') -(Worker_TP0 pid=444) DEBUG 04-22 00:17:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 00:17:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:17:54 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP0 pid=444) DEBUG 04-22 00:17:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:17:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=444) DEBUG 04-22 00:17:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:17:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 00:17:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:17:54 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP1 pid=445) DEBUG 04-22 00:17:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:17:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=445) DEBUG 04-22 00:17:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(APIServer pid=1) DEBUG 04-22 00:17:55 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP0 pid=444) INFO 04-22 00:17:55 [compilation/backends.py:372] Cache the graph of compile range (4097, 8192) for later use -(Worker_TP0 pid=444) DEBUG 04-22 00:17:55 [compilation/backends.py:377] Store the 0-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_0') -(Worker_TP0 pid=444) DEBUG 04-22 00:17:56 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 00:17:56 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:17:56 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 00:17:56 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:17:56 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=444) DEBUG 04-22 00:17:56 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.1 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:17:56 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:17:56 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=444) DEBUG 04-22 00:17:56 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:17:56 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=445) DEBUG 04-22 00:17:56 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.5 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:17:56 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms -(Worker_TP1 
pid=445) DEBUG 04-22 00:17:56 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=445) DEBUG 04-22 00:17:56 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:17:58 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_1', '/data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_1') -(Worker_TP0 pid=444) DEBUG 04-22 00:17:59 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 00:17:59 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:17:59 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP0 pid=444) DEBUG 04-22 00:17:59 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:17:59 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=444) DEBUG 04-22 00:17:59 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.8 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:17:59 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 00:17:59 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:17:59 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP1 pid=445) DEBUG 04-22 00:17:59 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:17:59 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 
0 nodes -(Worker_TP1 pid=445) DEBUG 04-22 00:17:59 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:18:00 [compilation/backends.py:377] Store the 1-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_1') -(APIServer pid=1) DEBUG 04-22 00:18:05 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=444) DEBUG 04-22 00:18:06 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 00:18:06 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:18:06 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 00:18:06 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:18:06 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=444) DEBUG 04-22 00:18:06 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:18:06 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:18:06 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP0 pid=444) DEBUG 04-22 00:18:06 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:18:06 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=445) DEBUG 04-22 00:18:06 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.8 ms 
-(Worker_TP1 pid=445) DEBUG 04-22 00:18:06 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:18:06 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP1 pid=445) DEBUG 04-22 00:18:06 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:18:07 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 00:18:07 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:18:07 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP1 pid=445) DEBUG 04-22 00:18:07 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-22 00:18:07 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=445) DEBUG 04-22 00:18:07 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:18:07 [compilation/backends.py:377] Store the 80-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_80', '/data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_80') -(Worker_TP0 pid=444) INFO 04-22 00:18:07 [compilation/backends.py:390] Compiling a graph for compile range (1, 4096) takes 13.82 s -(Worker_TP0 pid=444) DEBUG 04-22 00:18:08 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 00:18:08 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:18:08 [compilation/passes/pass_manager.py:100] Skipping with compile 
range (4097, 8192) -(Worker_TP0 pid=444) DEBUG 04-22 00:18:08 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:18:08 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=444) DEBUG 04-22 00:18:08 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 00:18:09 [compilation/backends.py:377] Store the 80-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_80', '/data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_80') -(Worker_TP0 pid=444) INFO 04-22 00:18:09 [compilation/backends.py:390] Compiling a graph for compile range (4097, 8192) takes 15.00 s -(Worker_TP0 pid=444) DEBUG 04-22 00:18:09 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/815459ed8e/rank_0_0/backbone/computation_graph.py -(Worker_TP0 pid=444) INFO 04-22 00:18:13 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/46f5ec07a78f772bd5aa151dedfba97679c60533c06813f17f54411b7bba24b7/rank_0_0/model -(Worker_TP0 pid=444) INFO 04-22 00:18:13 [compilation/monitor.py:48] torch.compile took 41.44 s in total -(Worker_TP0 pid=444) INFO 04-22 00:18:14 [compilation/monitor.py:76] Initial profiling/warmup run took 0.91 s -(APIServer pid=1) DEBUG 04-22 00:18:15 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP1 pid=445) INFO 04-22 00:18:20 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP1 pid=445) DEBUG 04-22 00:18:20 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP1 pid=445) INFO 04-22 00:18:20 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP0 pid=444) INFO 04-22 00:18:20 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP0 pid=444) DEBUG 04-22 00:18:20 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP0 pid=444) INFO 04-22 00:18:20 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP1 pid=445) DEBUG 04-22 00:18:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 108.00 MiB first-capture + (51-1) × 20.00 MiB per-graph -(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 108.00 MiB first-capture + (51-1) × 20.00 MiB per-graph -(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
-(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 268.00 MiB first-capture + (51-1) × 12.00 MiB per-graph -(Worker_TP0 pid=444) INFO 04-22 00:18:21 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP1 pid=445) DEBUG 04-22 00:18:21 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 268.00 MiB first-capture + (51-1) × 12.00 MiB per-graph -(Worker_TP1 pid=445) INFO 04-22 00:18:21 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP1 pid=445) DEBUG 04-22 00:18:22 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP1 pid=445) INFO 04-22 00:18:22 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.82 GiB total -(Worker_TP0 pid=444) DEBUG 04-22 00:18:22 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP0 pid=444) INFO 04-22 00:18:22 [v1/worker/gpu_model_runner.py:5955] Estimated 
CUDA graph memory: 1.82 GiB total -(Worker_TP1 pid=445) DEBUG 04-22 00:18:23 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP1 pid=445) DEBUG 04-22 00:18:23 [v1/worker/gpu_worker.py:430] Free memory after profiling: 41.35 GiB (total), 38.92 GiB (within requested) -(Worker_TP1 pid=445) DEBUG 04-22 00:18:23 [v1/worker/gpu_worker.py:435] Memory profiling takes 50.75 seconds. Total non KV cache memory: 37.95GiB; torch peak memory increase: 1.96GiB; non-torch forward increase memory: 2.1GiB; weights memory: 33.88GiB. -(Worker_TP1 pid=445) INFO 04-22 00:18:23 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9730 to maintain the same effective KV cache size. -(Worker_TP1 pid=445) DEBUG 04-22 00:18:23 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=444) DEBUG 04-22 00:18:23 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP0 pid=444) DEBUG 04-22 00:18:23 [v1/worker/gpu_worker.py:430] Free memory after profiling: 41.35 GiB (total), 38.92 GiB (within requested) -(Worker_TP0 pid=444) DEBUG 04-22 00:18:23 [v1/worker/gpu_worker.py:435] Memory profiling takes 50.74 seconds. Total non KV cache memory: 37.95GiB; torch peak memory increase: 1.96GiB; non-torch forward increase memory: 2.1GiB; weights memory: 33.88GiB. 
-(Worker_TP0 pid=444) INFO 04-22 00:18:23 [v1/worker/gpu_worker.py:436] Available KV cache memory: 37.28 GiB -(Worker_TP0 pid=444) INFO 04-22 00:18:23 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9730 to maintain the same effective KV cache size. -(Worker_TP0 pid=444) DEBUG 04-22 00:18:23 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 00:18:23 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) INFO 04-22 00:18:23 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 244,304 tokens -(EngineCore pid=245) INFO 04-22 00:18:23 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 29.82x -(Worker_TP0 pid=444) DEBUG 04-22 00:18:23 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=445) DEBUG 04-22 00:18:23 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=445) INFO 04-22 00:18:23 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 -(Worker_TP0 pid=444) INFO 04-22 00:18:23 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 -(Worker_TP0 pid=444) DEBUG 04-22 00:18:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 00:18:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) 2026-04-22 00:18:23,183 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(Worker_TP1 pid=445) 2026-04-22 00:18:23,183 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP0 pid=444) DEBUG 04-22 00:18:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 00:18:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) 2026-04-22 00:18:23,737 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP0 pid=444) 2026-04-22 00:18:23,737 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=245) DEBUG 04-22 00:18:24 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=444) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=245) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=245) INFO 04-22 00:18:33 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP0 pid=444) DEBUG 04-22 00:18:33 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=445) DEBUG 04-22 00:18:33 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 00:18:33 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 00:18:33 [v1/engine/core.py:1158] EngineCore waiting for work. -(EngineCore pid=245) DEBUG 04-22 00:18:33 [v1/engine/core.py:1158] EngineCore waiting for work. 
-(APIServer pid=1) INFO 04-22 00:18:33 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] -(APIServer pid=1) WARNING 04-22 00:18:34 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 0.6, 'top_p': 0.9}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. -(APIServer pid=1) DEBUG 04-22 00:18:34 [renderers/base.py:197] Warming up chat template processing... -(APIServer pid=1) DEBUG 04-22 00:18:34 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e813da-230200740f79100a4c5b90d4;a4ef458b-27aa-4ba5-8bb8-4615bcbc9448) -(APIServer pid=1) DEBUG 04-22 00:18:34 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 00:18:34 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic/resolve/main/processor_config.json. -(APIServer pid=1) DEBUG 04-22 00:18:34 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e813da-115de46779174b0167c84f5d;86639a92-2492-4faf-9dd4-35b86bad219a) -(APIServer pid=1) DEBUG 04-22 00:18:34 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 00:18:34 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic/resolve/main/preprocessor_config.json. -(Worker_TP1 pid=445) DEBUG 04-22 00:18:34 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=444) DEBUG 04-22 00:18:34 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) INFO 04-22 00:18:35 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. 
-(APIServer pid=1) DEBUG 04-22 00:18:35 [renderers/base.py:203] Chat template warmup completed in 1.424s -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:37] Available routes are: -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /load, Methods: GET -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /version, Methods: GET -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /health, Methods: GET -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /ping, Methods: GET -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /ping, Methods: POST -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST -(APIServer pid=1) INFO 04-22 00:18:36 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST -(APIServer pid=1) INFO 04-22 00:18:36 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST -(APIServer pid=1) INFO: Started server process [1] -(APIServer pid=1) INFO: Waiting for application startup. -(APIServer pid=1) INFO: Application startup complete. -(APIServer pid=1) DEBUG 04-22 00:18:41 [v1/engine/async_llm.py:875] Called check_health. -(APIServer pid=1) INFO: 10.128.4.2:46564 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/fp8dyn-llama-3-3-70b--h100-80gb--tp4pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/fp8dyn-llama-3-3-70b--h100-80gb--tp4pp1dp1--8192.log deleted file mode 100644 index b4c1e019..00000000 --- a/accuracy/results/v0.19.0/logs/fp8dyn-llama-3-3-70b--h100-80gb--tp4pp1dp1--8192.log +++ /dev/null @@ -1,4086 +0,0 @@ -DEBUG 04-22 00:18:46 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
-DEBUG 04-22 00:18:46 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:18:46 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:18:46 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:18:46 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:18:46 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:18:46 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:18:46 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:18:46 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:18:46 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:18:46 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:18:46 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:18:51 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:18:52 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 00:18:52 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:18:52 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:18:52 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:18:52 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:18:52 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:18:52 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:18:52 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:18:52 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic -(APIServer pid=1) INFO 04-22 00:18:52 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:18:52 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:18:52 [entrypoints/utils.py:233] non-default args: {'model_tag': 'RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic', 'model': 'RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic', 'max_model_len': 8192, 'tensor_parallel_size': 4, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:18:52 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:18:53 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:18:53 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003907 secs -(APIServer pid=1) INFO 04-22 00:18:53 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 00:18:53 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:18:53 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:18:53 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:18:53 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:18:53 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 00:18:53 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:18:53 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 00:18:53 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:18:53 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) INFO 04-22 00:18:55 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 00:18:55 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:18:55 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:18:56 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:18:56 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:19:00 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:19:00 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:19:00 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:19:00 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:19:00 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:19:00 [platforms/__init__.py:113] Checking if ROCm platform is available. 
-DEBUG 04-22 00:19:00 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:19:00 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:19:00 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:19:00 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:19:00 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:19:00 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:19:05 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(APIServer pid=1) DEBUG 04-22 00:19:06 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. -(EngineCore pid=243) DEBUG 04-22 00:19:06 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:19:06 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=243) DEBUG 04-22 00:19:06 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/759dcae2-a02d-4b3e-ae72-e93c4fadf53c'], outputs=['ipc:///tmp/9fb9ed15-dbde-4df5-92aa-a30bae7a9025'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 00:19:06 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 00:19:06 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 00:19:06 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 00:19:06 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 00:19:06 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=243) INFO 04-22 00:19:06 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic', speculative_config=None, tokenizer='RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [128, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) WARNING 04-22 00:19:06 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=243) INFO 04-22 00:19:06 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.128.4.191 (local), world_size=4, local_world_size=4 -(EngineCore pid=243) DEBUG 04-22 00:19:06 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/0924f52e-4b2c-4b6b-ad61-2dc2800e583c -(EngineCore pid=243) DEBUG 04-22 00:19:06 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_af2d89f2'), local_subscribe_addr='ipc:///tmp/0924f52e-4b2c-4b6b-ad61-2dc2800e583c', local_notify_addr='ipc:///tmp/81423ce9-3958-4734-a217-e6b87223c977', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 00:19:10 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:19:10 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:19:10 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:19:10 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:19:10 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:19:10 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:19:10 [platforms/__init__.py:134] Checking if XPU platform is available. 
-DEBUG 04-22 00:19:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:19:10 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:19:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:19:10 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:19:10 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:19:10 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:19:10 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:19:10 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:19:10 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
-DEBUG 04-22 00:19:10 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:19:10 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:19:10 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:19:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:19:10 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:19:15 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:19:15 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:19:15 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-22 00:19:15 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:19:16 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:19:16 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:19:16 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:19:16 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) DEBUG 04-22 00:19:16 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -DEBUG 04-22 00:19:16 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:19:16 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:19:16 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:19:16 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 00:19:16 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:19:16 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:19:16 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:19:16 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-DEBUG 04-22 00:19:17 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:19:17 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:19:17 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:19:17 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(Worker pid=444) DEBUG 04-22 00:19:17 [distributed/parallel_state.py:1356] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:39521 backend=nccl -(Worker pid=444) INFO 04-22 00:19:17 [distributed/parallel_state.py:1400] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:39521 backend=nccl -(Worker pid=442) DEBUG 04-22 00:19:17 [distributed/parallel_state.py:1356] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:39521 backend=nccl -(Worker pid=442) INFO 04-22 00:19:17 [distributed/parallel_state.py:1400] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:39521 backend=nccl -(Worker pid=443) DEBUG 04-22 00:19:17 [distributed/parallel_state.py:1356] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:39521 backend=nccl -(Worker pid=443) INFO 04-22 00:19:17 [distributed/parallel_state.py:1400] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:39521 backend=nccl -(Worker pid=445) DEBUG 04-22 00:19:17 [distributed/parallel_state.py:1356] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:39521 backend=nccl -(Worker pid=445) INFO 04-22 00:19:17 [distributed/parallel_state.py:1400] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:39521 backend=nccl -[Gloo] Rank 0 is connected to 3 peer ranks. 
Expected number of connected peer ranks is : 3 -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=443) DEBUG 04-22 00:19:17 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=442) DEBUG 04-22 00:19:17 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=444) DEBUG 04-22 00:19:17 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=445) DEBUG 04-22 00:19:17 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=442) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=442) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=442) DEBUG 04-22 00:19:18 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=442) INFO 04-22 00:19:18 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=444) DEBUG 04-22 00:19:20 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=443) DEBUG 04-22 00:19:20 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=445) DEBUG 04-22 00:19:20 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=442) DEBUG 04-22 00:19:20 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=442) DEBUG 04-22 00:19:20 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/4b28722e-c8bc-4a68-8db1-1fe76bb69325 -(Worker pid=442) DEBUG 04-22 00:19:20 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3], buffer_handle=(3, 4194304, 6, 'psm_ac4a1702'), local_subscribe_addr='ipc:///tmp/4b28722e-c8bc-4a68-8db1-1fe76bb69325', local_notify_addr='ipc:///tmp/260a4a4f-1f33-4aa9-bef9-48d4c19cb46c', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=444) DEBUG 04-22 00:19:20 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/4b28722e-c8bc-4a68-8db1-1fe76bb69325 -(Worker pid=443) DEBUG 04-22 00:19:20 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/4b28722e-c8bc-4a68-8db1-1fe76bb69325 -(Worker pid=445) DEBUG 04-22 00:19:20 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/4b28722e-c8bc-4a68-8db1-1fe76bb69325 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(Worker pid=442) INFO 04-22 00:19:20 [distributed/parallel_state.py:1716] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(Worker pid=445) DEBUG 04-22 00:19:20 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776817160.5383635, auto_measure=True -(Worker pid=445) DEBUG 04-22 00:19:20 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=443) DEBUG 04-22 00:19:20 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776817160.5518556, auto_measure=True -(Worker pid=443) DEBUG 04-22 00:19:20 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=442) DEBUG 04-22 00:19:20 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776817160.553573, auto_measure=True -(Worker pid=442) DEBUG 04-22 00:19:20 [v1/worker/gpu_worker.py:285] worker 
requested memory: 75.23GiB -(Worker pid=444) DEBUG 04-22 00:19:20 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776817160.5982003, auto_measure=True -(Worker pid=444) DEBUG 04-22 00:19:20 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=445) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=445) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=445) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=442) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=442) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=443) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=442) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred 
dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=445) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=442) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=445) DEBUG 04-22 00:19:20 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=443) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=444) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=442) DEBUG 04-22 00:19:20 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=444) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=442) DEBUG 04-22 00:19:20 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=444) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 00:19:20 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). 
-(Worker pid=444) DEBUG 04-22 00:19:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=444) DEBUG 04-22 00:19:20 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=442) DEBUG 04-22 00:19:20 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_TP0 pid=442) INFO 04-22 00:19:20 [v1/worker/gpu_model_runner.py:4735] Starting to load model RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic... -(Worker_TP3 pid=445) DEBUG 04-22 00:19:20 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:20 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.o_proj -(Worker_TP0 pid=442) INFO 04-22 00:19:21 [model_executor/.../linear/__init__.py:261] Selected CutlassFP8ScaledMMLinearKernel for CompressedTensorsW8A8Fp8 -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.o_proj -(Worker_TP0 pid=442) INFO 04-22 00:19:21 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. 
-(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.2.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_TP0 pid=442) INFO 04-22 00:19:21 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(Worker_TP0 pid=442) INFO 04-22 00:19:21 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.qkv_proj -(Worker_TP1 
pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.11.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.qkv_proj -(Worker_TP0 
pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.14.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.gate_up_proj -(Worker_TP2 
pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.17.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.down_proj -(Worker_TP0 
pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.19.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.down_proj -(Worker_TP0 
pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.8.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.gate_up_proj 
-(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.25.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.o_proj -(Worker_TP3 
pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.20.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.gate_up_proj 
-(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.30.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.down_proj -(Worker_TP3 
pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.25.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.gate_up_proj 
-(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 
for model.layers.35.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.qkv_proj 
-(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.24.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.down_proj -(Worker_TP3 
pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.34.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.28.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.gate_up_proj 
-(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.29.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 
for model.layers.36.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.30.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.31.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.o_proj 
-(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.32.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.46.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.33.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.34.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.o_proj -(Worker_TP0 
pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.41.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.35.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.36.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.down_proj -(Worker_TP2 
pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.37.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.51.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.38.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.39.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.gate_up_proj 
-(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.40.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.54.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.41.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.42.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.qkv_proj 
-(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.49.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.43.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.44.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.44.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.down_proj -(Worker_TP1 
pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.45.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.46.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.46.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.47.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.gate_up_proj 
-(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.48.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.62.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.49.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.50.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.qkv_proj -(Worker_TP0 
pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.64.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.51.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.52.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.down_proj 
-(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.53.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 
for model.layers.61.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.54.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.55.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.qkv_proj 
-(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.56.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.57.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.57.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.58.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.qkv_proj -(Worker_TP0 
pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.59.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.59.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.60.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.61.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.down_proj -(Worker_TP1 
pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.62.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.62.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.63.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.gate_up_proj 
-(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.64.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.71.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.65.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.72.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.qkv_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.gate_up_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.66.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:21 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.self_attn.o_proj -(Worker_TP1 
pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.67.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.74.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.68.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.69.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.70.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.gate_up_proj -(Worker_TP0 
pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.self_attn.o_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:22 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.71.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.gate_up_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.o_proj 
-(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.72.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.qkv_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.qkv_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.self_attn.o_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.gate_up_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.79.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:22 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP3 pid=445) DEBUG 04-22 00:19:22 [config/compilation.py:1195] disabled custom ops: Counter({'quant_fp8': 320, 'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.73.mlp.down_proj -(Worker_TP3 pid=445) DEBUG 04-22 00:19:22 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.74.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.75.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.75.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.76.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.self_attn.o_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.77.mlp.down_proj -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.78.mlp.down_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.qkv_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.self_attn.o_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.gate_up_proj -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.79.mlp.down_proj -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [config/compilation.py:1195] disabled custom ops: Counter({'quant_fp8': 320, 'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [config/compilation.py:1195] disabled custom ops: Counter({'quant_fp8': 320, 'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [config/compilation.py:1195] disabled custom ops: Counter({'quant_fp8': 320, 'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP3 pid=445) DEBUG 04-22 00:19:22 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00010-of-00015.safetensors', 'model-00006-of-00015.safetensors', 'model-00007-of-00015.safetensors', 'model-00002-of-00015.safetensors', 'model-00011-of-00015.safetensors', 'model-00012-of-00015.safetensors', 'model-00003-of-00015.safetensors', 'model-00001-of-00015.safetensors', 'model-00009-of-00015.safetensors', 'model-00014-of-00015.safetensors', 'model-00004-of-00015.safetensors', 'model-00015-of-00015.safetensors', 'model-00013-of-00015.safetensors', 'model-00005-of-00015.safetensors', 'model-00008-of-00015.safetensors']] -(Worker_TP0 pid=442) DEBUG 04-22 00:19:22 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00011-of-00015.safetensors', 'model-00014-of-00015.safetensors', 'model-00009-of-00015.safetensors', 'model-00002-of-00015.safetensors', 'model-00010-of-00015.safetensors', 'model-00001-of-00015.safetensors', 'model-00004-of-00015.safetensors', 'model-00015-of-00015.safetensors', 'model-00005-of-00015.safetensors', 'model-00006-of-00015.safetensors', 'model-00007-of-00015.safetensors', 'model-00013-of-00015.safetensors', 'model-00012-of-00015.safetensors', 'model-00003-of-00015.safetensors', 'model-00008-of-00015.safetensors']] -(Worker_TP1 pid=443) DEBUG 04-22 00:19:22 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00006-of-00015.safetensors', 'model-00009-of-00015.safetensors', 'model-00002-of-00015.safetensors', 'model-00011-of-00015.safetensors', 'model-00013-of-00015.safetensors', 'model-00007-of-00015.safetensors', 'model-00008-of-00015.safetensors', 'model-00005-of-00015.safetensors', 'model-00012-of-00015.safetensors', 'model-00014-of-00015.safetensors', 'model-00010-of-00015.safetensors', 'model-00003-of-00015.safetensors', 'model-00015-of-00015.safetensors', 'model-00001-of-00015.safetensors', 'model-00004-of-00015.safetensors']] 
-(Worker_TP2 pid=444) DEBUG 04-22 00:19:22 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00012-of-00015.safetensors', 'model-00004-of-00015.safetensors', 'model-00005-of-00015.safetensors', 'model-00015-of-00015.safetensors', 'model-00002-of-00015.safetensors', 'model-00013-of-00015.safetensors', 'model-00001-of-00015.safetensors', 'model-00010-of-00015.safetensors', 'model-00014-of-00015.safetensors', 'model-00003-of-00015.safetensors', 'model-00008-of-00015.safetensors', 'model-00009-of-00015.safetensors', 'model-00011-of-00015.safetensors', 'model-00006-of-00015.safetensors', 'model-00007-of-00015.safetensors']] -(Worker_TP0 pid=442) Loading safetensors checkpoint shards: 0% Completed | 0/15 [00:00 -(Worker_TP0 pid=442) DEBUG 04-22 00:19:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:19:37 [compilation/decorators.py:528] Start compiling function -(Worker_TP2 pid=444) DEBUG 04-22 00:19:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:19:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 00:19:37 [compilation/decorators.py:528] Start compiling function -(Worker_TP1 pid=443) DEBUG 04-22 00:19:37 [compilation/decorators.py:528] Start compiling function -(APIServer pid=1) DEBUG 04-22 00:19:46 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP3 pid=445) DEBUG 04-22 
00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP0 pid=442) DEBUG 04-22 
00:19:54 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 
[compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=cb10c79acc comp=e546579c48 code=f156083917c384d07960be9da52b3831ca85a8c6cca0b42b018e2920d91ddd8b dir=/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_3_0/backbone -(Worker_TP0 pid=442) INFO 04-22 00:19:54 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone for vLLM's torch.compile -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=cb10c79acc comp=e546579c48 code=f156083917c384d07960be9da52b3831ca85a8c6cca0b42b018e2920d91ddd8b dir=/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP3 pid=445) DEBUG 
04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, 
-(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, 
-(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP3 pid=445) 
DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP3 
pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP3 pid=445) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] Vllm config hash: cb10c79acc -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 
'auto', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, 
-(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
-(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] Vllm config hash: cb10c79acc -(Worker_TP0 pid=442) INFO 04-22 00:19:54 [compilation/backends.py:1111] Dynamo bytecode transform time: 17.21 s -(Worker_TP0 pid=442) DEBUG 04-22 00:19:54 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 2 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 128 -(Worker_TP0 pid=442) INFO 04-22 00:19:54 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP2 
pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=cb10c79acc comp=e546579c48 code=f156083917c384d07960be9da52b3831ca85a8c6cca0b42b018e2920d91ddd8b dir=/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_2_0/backbone -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP2 pid=444) DEBUG 04-22 
00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': 
'12.9', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP2 
pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': 
None, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP2 pid=444) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] Vllm config hash: cb10c79acc -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP1 
pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=cb10c79acc comp=e546579c48 code=f156083917c384d07960be9da52b3831ca85a8c6cca0b42b018e2920d91ddd8b dir=/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_1_0/backbone -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP1 pid=443) DEBUG 04-22 
00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': 
'12.9', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 
[compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP1 
pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': 
None, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 
'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP1 pid=443) DEBUG 04-22 00:19:54 [compilation/backends.py:1074] Vllm config hash: cb10c79acc -(Worker_TP0 pid=442) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. -(Worker_TP0 pid=442) return func(*args, **kwargs) -(Worker_TP1 pid=443) DEBUG 04-22 00:19:55 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=1, max_token_num=128, hidden_dim=8192, dtype=torch.bfloat16 -(Worker_TP2 pid=444) DEBUG 04-22 00:19:55 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=2, max_token_num=128, hidden_dim=8192, dtype=torch.bfloat16 -(Worker_TP0 pid=442) DEBUG 04-22 00:19:55 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=0, max_token_num=128, hidden_dim=8192, dtype=torch.bfloat16 -(Worker_TP3 pid=445) DEBUG 04-22 00:19:55 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=3, max_token_num=128, hidden_dim=8192, dtype=torch.bfloat16 -(Worker_TP0 pid=442) INFO 04-22 00:19:55 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm 
fusion workspace with backend=trtllm -(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 128), (129, 8192)] -(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(APIServer pid=1) DEBUG 04-22 00:19:56 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP1 pid=443) DEBUG 04-22 00:19:56 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:19:56 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP1 pid=443) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 24.7 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:19:56 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP1 pid=443) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:19:56 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=445) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:19:56 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP3 pid=445) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 25.7 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:19:56 
[compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP3 pid=445) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:19:56 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=444) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:19:56 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP2 pid=444) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 25.0 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:19:56 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP2 pid=444) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 25.1 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP0 pid=442) DEBUG 04-22 00:19:56 [compilation/passes/vllm_inductor_pass.py:79] 
FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=442) INFO 04-22 00:19:59 [compilation/backends.py:372] Cache the graph of compile range (1, 128) for later use -(Worker_TP0 pid=442) DEBUG 04-22 00:19:59 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 128) from inductor_standalone via handle ('artifact_compile_range_1_128_subgraph_0', '/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone/artifact_compile_range_1_128_subgraph_0') -(Worker_TP1 pid=443) DEBUG 04-22 00:20:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:20:00 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP1 pid=443) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:20:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=443) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=444) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:00 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP0 pid=442) DEBUG 04-22 00:20:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] 
PostCleanupPass completed in 0.4 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:00 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP2 pid=444) DEBUG 04-22 00:20:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=444) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=442) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:20:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=445) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:20:00 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP3 pid=445) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:20:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=445) DEBUG 04-22 00:20:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=442) INFO 04-22 00:20:01 [compilation/backends.py:372] Cache the graph of compile range (129, 8192) for later use -(Worker_TP0 pid=442) DEBUG 04-22 00:20:01 [compilation/backends.py:377] Store the 0-th graph for compile range(129, 8192) from inductor_standalone via handle ('artifact_compile_range_129_8192_subgraph_0', 
'/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone/artifact_compile_range_129_8192_subgraph_0') -(Worker_TP0 pid=442) DEBUG 04-22 00:20:02 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:02 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=444) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:02 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=442) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.6 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:02 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=442) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:02 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP2 pid=444) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 40.2 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:02 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP2 pid=444) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:20:02 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=445) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:20:02 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP3 pid=445) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.6 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:20:02 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP3 pid=445) DEBUG 04-22 00:20:02 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:20:03 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 00:20:03 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:20:03 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=443) DEBUG 04-22 00:20:03 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 40.3 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:20:03 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:20:03 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=443) DEBUG 04-22 00:20:03 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:04 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 128) from inductor_standalone via handle ('artifact_compile_range_1_128_subgraph_1', 
'/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone/artifact_compile_range_1_128_subgraph_1') -(Worker_TP0 pid=442) DEBUG 04-22 00:20:05 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:05 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP0 pid=442) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:20:05 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:05 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=443) DEBUG 04-22 00:20:05 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP0 pid=442) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:20:05 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=443) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:05 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=444) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:05 
[compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP2 pid=444) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:05 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=444) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:20:05 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=445) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:20:05 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP3 pid=445) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.6 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:20:05 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=445) DEBUG 04-22 00:20:05 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:06 [compilation/backends.py:377] Store the 1-th graph for compile range(129, 8192) from inductor_standalone via handle ('artifact_compile_range_129_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone/artifact_compile_range_129_8192_subgraph_1') -(APIServer pid=1) DEBUG 04-22 00:20:06 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP0 pid=442) DEBUG 04-22 00:20:12 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 00:20:12 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:12 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=442) DEBUG 04-22 00:20:12 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 40.6 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:12 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:12 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP0 pid=442) DEBUG 04-22 00:20:12 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:12 [compilation/backends.py:377] Store the 80-th graph for compile range(1, 128) from inductor_standalone via handle ('artifact_compile_range_1_128_subgraph_80', '/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone/artifact_compile_range_1_128_subgraph_80') -(Worker_TP0 pid=442) INFO 04-22 00:20:12 [compilation/backends.py:390] Compiling a graph for compile range (1, 128) takes 13.04 s -(Worker_TP0 pid=442) DEBUG 04-22 00:20:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 00:20:13 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms 
-(Worker_TP0 pid=442) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=442) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.8 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 40.6 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:20:13 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=445) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:20:13 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP3 pid=445) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.5 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:20:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP3 pid=445) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=443) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/passes/pass_manager.py:100] Skipping with 
compile range (129, 8192) -(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=444) DEBUG 04-22 00:20:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:13 [compilation/backends.py:377] Store the 80-th graph for compile range(129, 8192) from inductor_standalone via handle ('artifact_compile_range_129_8192_subgraph_80', '/data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone/artifact_compile_range_129_8192_subgraph_80') -(Worker_TP0 pid=442) INFO 04-22 00:20:13 [compilation/backends.py:390] Compiling a graph for compile range (129, 8192) takes 14.17 s -(Worker_TP3 pid=445) DEBUG 04-22 00:20:14 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=445) DEBUG 04-22 00:20:14 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:20:14 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP3 pid=445) DEBUG 04-22 00:20:14 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP3 pid=445) DEBUG 04-22 00:20:14 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=445) DEBUG 04-22 00:20:14 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:14 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/f06e0c9df2/rank_0_0/backbone/computation_graph.py -(APIServer pid=1) DEBUG 04-22 00:20:16 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP0 pid=442) INFO 04-22 00:20:19 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/484cf5817fb0bcdb65e1788392f05da3276f6ad8c6f5b65d8c5aef577cc4f5f9/rank_0_0/model -(Worker_TP0 pid=442) INFO 04-22 00:20:19 [compilation/monitor.py:48] torch.compile took 42.22 s in total -(Worker_TP0 pid=442) INFO 04-22 00:20:20 [compilation/monitor.py:76] Initial profiling/warmup run took 1.12 s -(Worker_TP1 pid=443) INFO 04-22 00:20:26 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP1 pid=443) DEBUG 04-22 00:20:26 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP1 pid=443) INFO 04-22 00:20:26 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP3 pid=445) INFO 04-22 00:20:26 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP3 pid=445) DEBUG 04-22 00:20:26 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP3 pid=445) INFO 04-22 00:20:26 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP0 pid=442) INFO 04-22 00:20:26 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP0 pid=442) DEBUG 04-22 00:20:26 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP0 pid=442) INFO 04-22 00:20:26 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(APIServer pid=1) DEBUG 04-22 00:20:26 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP2 pid=444) INFO 04-22 00:20:27 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP2 pid=444) INFO 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 
[compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 98.00 MiB first-capture + (51-1) × 18.00 MiB per-graph -(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 98.00 MiB first-capture + (51-1) × 18.00 MiB per-graph -(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 98.00 MiB first-capture + (51-1) × 18.00 MiB per-graph -(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 98.00 MiB first-capture + (51-1) × 18.00 MiB per-graph -(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 
00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: 
None -(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=444) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 142.00 MiB first-capture + (51-1) × 14.00 MiB per-graph -(Worker_TP2 pid=444) INFO 04-22 00:20:27 [distributed/device_communicators/custom_all_reduce.py:216] Registering 322 cuda graph addresses -(Worker_TP0 pid=442) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 142.00 MiB first-capture + (51-1) × 14.00 MiB per-graph -(Worker_TP0 pid=442) INFO 04-22 00:20:27 [distributed/device_communicators/custom_all_reduce.py:216] Registering 322 cuda graph addresses -(Worker_TP1 pid=443) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 142.00 MiB first-capture + (51-1) × 14.00 MiB per-graph -(Worker_TP1 pid=443) INFO 04-22 00:20:27 [distributed/device_communicators/custom_all_reduce.py:216] Registering 322 cuda graph addresses -(Worker_TP3 pid=445) DEBUG 04-22 00:20:27 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 142.00 MiB first-capture + (51-1) × 14.00 MiB per-graph -(Worker_TP3 pid=445) INFO 04-22 00:20:27 [distributed/device_communicators/custom_all_reduce.py:216] Registering 322 cuda graph addresses -(Worker_TP3 pid=445) DEBUG 04-22 00:20:28 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP3 pid=445) INFO 04-22 00:20:28 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA 
graph memory: 1.70 GiB total -(Worker_TP2 pid=444) DEBUG 04-22 00:20:28 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP1 pid=443) DEBUG 04-22 00:20:28 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP2 pid=444) INFO 04-22 00:20:28 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.70 GiB total -(Worker_TP1 pid=443) INFO 04-22 00:20:28 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.70 GiB total -(Worker_TP0 pid=442) DEBUG 04-22 00:20:28 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP0 pid=442) INFO 04-22 00:20:28 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.70 GiB total -(Worker_TP3 pid=445) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP3 pid=445) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:430] Free memory after profiling: 58.07 GiB (total), 55.66 GiB (within requested) -(Worker_TP3 pid=445) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:435] Memory profiling takes 52.07 seconds. Total non KV cache memory: 21.14GiB; torch peak memory increase: 1.97GiB; non-torch forward increase memory: 2.21GiB; weights memory: 16.96GiB. -(Worker_TP3 pid=445) INFO 04-22 00:20:29 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9715 to maintain the same effective KV cache size. 
-(Worker_TP3 pid=445) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=444) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP2 pid=444) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:430] Free memory after profiling: 58.07 GiB (total), 55.66 GiB (within requested) -(Worker_TP2 pid=444) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:435] Memory profiling takes 52.09 seconds. Total non KV cache memory: 21.14GiB; torch peak memory increase: 1.97GiB; non-torch forward increase memory: 2.21GiB; weights memory: 16.96GiB. -(Worker_TP2 pid=444) INFO 04-22 00:20:29 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9715 to maintain the same effective KV cache size. -(Worker_TP2 pid=444) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=442) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP0 pid=442) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:430] Free memory after profiling: 58.07 GiB (total), 55.66 GiB (within requested) -(Worker_TP0 pid=442) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:435] Memory profiling takes 52.17 seconds. Total non KV cache memory: 21.14GiB; torch peak memory increase: 1.97GiB; non-torch forward increase memory: 2.21GiB; weights memory: 16.96GiB. 
-(Worker_TP0 pid=442) INFO 04-22 00:20:29 [v1/worker/gpu_worker.py:436] Available KV cache memory: 54.09 GiB -(Worker_TP0 pid=442) INFO 04-22 00:20:29 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9715 to maintain the same effective KV cache size. -(Worker_TP0 pid=442) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=443) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP1 pid=443) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:430] Free memory after profiling: 58.07 GiB (total), 55.66 GiB (within requested) -(Worker_TP1 pid=443) DEBUG 04-22 00:20:29 [v1/worker/gpu_worker.py:435] Memory profiling takes 52.12 seconds. Total non KV cache memory: 21.14GiB; torch peak memory increase: 1.97GiB; non-torch forward increase memory: 2.21GiB; weights memory: 16.96GiB. -(Worker_TP1 pid=443) INFO 04-22 00:20:29 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9715 to maintain the same effective KV cache size. 
-(EngineCore pid=243) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=443) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) INFO 04-22 00:20:29 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 708,992 tokens -(EngineCore pid=243) INFO 04-22 00:20:29 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 86.55x -(Worker_TP1 pid=443) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=442) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=444) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=445) DEBUG 04-22 00:20:29 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=442) 2026-04-22 00:20:29,522 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP3 pid=445) 2026-04-22 00:20:29,522 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP1 pid=443) 2026-04-22 00:20:29,522 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP2 pid=444) 2026-04-22 00:20:29,522 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(Worker_TP0 pid=442) DEBUG 04-22 00:20:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:20:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 00:20:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 00:20:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) 2026-04-22 00:20:29,542 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP1 pid=443) 2026-04-22 00:20:29,542 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP2 pid=444) 2026-04-22 00:20:29,543 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP3 pid=445) 2026-04-22 00:20:29,543 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP3 pid=445) DEBUG 04-22 00:20:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 00:20:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:20:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=243) INFO 04-22 00:20:38 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP0 pid=442) DEBUG 04-22 00:20:38 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=444) DEBUG 04-22 00:20:38 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=443) DEBUG 04-22 00:20:38 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=445) DEBUG 04-22 00:20:38 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 00:20:38 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 00:20:38 [v1/engine/core.py:1158] EngineCore waiting for work. -(EngineCore pid=243) DEBUG 04-22 00:20:38 [v1/engine/core.py:1158] EngineCore waiting for work. -(APIServer pid=1) INFO 04-22 00:20:38 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] -(APIServer pid=1) WARNING 04-22 00:20:39 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 0.6, 'top_p': 0.9}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. -(APIServer pid=1) DEBUG 04-22 00:20:39 [renderers/base.py:197] Warming up chat template processing... -(APIServer pid=1) DEBUG 04-22 00:20:39 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e81457-26a4c5e34aa79af7740f7281;feab3de1-de34-4d52-bd22-8ad50e42a3d4) -(APIServer pid=1) DEBUG 04-22 00:20:39 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 00:20:39 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic/resolve/main/processor_config.json. 
-(APIServer pid=1) DEBUG 04-22 00:20:39 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e81457-1ed4e7002e0a45454ba03fe8;1cdc5161-22f2-439d-9115-dff957aff247) -(APIServer pid=1) DEBUG 04-22 00:20:39 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 00:20:39 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic/resolve/main/preprocessor_config.json. -(Worker_TP2 pid=444) DEBUG 04-22 00:20:39 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=442) DEBUG 04-22 00:20:39 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=445) DEBUG 04-22 00:20:39 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=443) DEBUG 04-22 00:20:39 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) INFO 04-22 00:20:40 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. 
-(APIServer pid=1) DEBUG 04-22 00:20:40 [renderers/base.py:203] Chat template warmup completed in 1.352s -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:37] Available routes are: -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /load, Methods: GET -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /version, Methods: GET -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /health, Methods: GET -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /ping, Methods: GET -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /ping, Methods: POST -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST -(APIServer pid=1) INFO 04-22 00:20:41 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST -(APIServer pid=1) INFO 04-22 00:20:41 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST -(APIServer pid=1) INFO: Started server process [1] -(APIServer pid=1) INFO: Waiting for application startup. -(APIServer pid=1) INFO: Application startup complete. -(APIServer pid=1) DEBUG 04-22 00:20:41 [v1/engine/async_llm.py:875] Called check_health. -(APIServer pid=1) INFO: 10.128.4.2:55156 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/fp8dyn-redhatai-qwen2-5---h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/fp8dyn-redhatai-qwen2-5---h100-80gb--tp1pp1dp1--8192.log deleted file mode 100644 index d62dbe3d..00000000 --- a/accuracy/results/v0.19.0/logs/fp8dyn-redhatai-qwen2-5---h100-80gb--tp1pp1dp1--8192.log +++ /dev/null @@ -1,864 +0,0 @@ -DEBUG 04-22 00:22:13 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
-DEBUG 04-22 00:22:13 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:22:13 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:22:13 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:22:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:22:13 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:22:13 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:22:13 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:22:13 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:22:13 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:22:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:22:13 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:22:18 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:22:20 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 00:22:20 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:22:20 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:22:20 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:22:20 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:22:20 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:22:20 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:22:20 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:22:20 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic -(APIServer pid=1) INFO 04-22 00:22:20 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:22:20 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:22:20 [entrypoints/utils.py:233] non-default args: {'model_tag': 'RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic', 'model': 'RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:22:20 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:22:20 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:22:20 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0004829 secs -(APIServer pid=1) INFO 04-22 00:22:20 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM -(APIServer pid=1) INFO 04-22 00:22:20 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:22:21 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:22:21 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:22:21 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:22:21 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 00:22:21 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 00:22:21 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:22:21 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:22:21 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:22:21 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:22:21 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:22:21 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:22:25 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:22:25 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:22:25 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:22:25 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:22:25 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:22:25 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:22:25 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:22:25 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:22:25 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:22:25 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:22:25 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:22:25 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-22 00:22:30 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(APIServer pid=1) DEBUG 04-22 00:22:32 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. -(EngineCore pid=242) DEBUG 04-22 00:22:32 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:22:32 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=242) DEBUG 04-22 00:22:32 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/a0babe96-5d72-4573-be1a-3d5bf9f4178d'], outputs=['ipc:///tmp/14c08b43-219a-4301-a691-68700f8de8bb'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=242) DEBUG 04-22 00:22:32 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=242) DEBUG 04-22 00:22:32 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=242) DEBUG 04-22 00:22:32 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=242) DEBUG 04-22 00:22:32 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=242) DEBUG 04-22 00:22:32 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=242) INFO 04-22 00:22:32 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic', speculative_config=None, tokenizer='RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=242) DEBUG 04-22 00:22:32 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.192:39497 backend=nccl -(EngineCore pid=242) INFO 04-22 00:22:32 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.192:39497 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=242) DEBUG 04-22 00:22:32 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=242) INFO 04-22 00:22:32 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=242) DEBUG 04-22 00:22:32 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776817352.8785338, auto_measure=True -(EngineCore pid=242) DEBUG 04-22 00:22:32 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=242) DEBUG 04-22 00:22:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=242) DEBUG 04-22 00:22:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=242) DEBUG 04-22 00:22:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=242) DEBUG 04-22 00:22:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=242) DEBUG 04-22 00:22:33 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=242) DEBUG 04-22 00:22:33 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=242) INFO 04-22 00:22:33 [v1/worker/gpu_model_runner.py:4735] Starting to load model RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic... -(EngineCore pid=242) INFO 04-22 00:22:33 [model_executor/.../linear/__init__.py:261] Selected CutlassFP8ScaledMMLinearKernel for CompressedTensorsW8A8Fp8 -(EngineCore pid=242) INFO 04-22 00:22:33 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=242) INFO 04-22 00:22:33 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(EngineCore pid=242) INFO 04-22 00:22:33 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.0.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.1.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.2.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.3.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.4.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.gate_up_proj -(EngineCore 
pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.5.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.6.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.7.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.8.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.8.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.9.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.10.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.11.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.12.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.13.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.14.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.15.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.self_attn.o_proj 
-(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.16.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.17.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.18.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for 
model.layers.19.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.19.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.20.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Fp8 for model.layers.21.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.22.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.23.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.24.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.25.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.qkv_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.26.mlp.down_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.qkv_proj 
-(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.self_attn.o_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.gate_up_proj -(EngineCore pid=242) DEBUG 04-22 00:22:33 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Fp8 for model.layers.27.mlp.down_proj -(EngineCore pid=242) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=242) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=242) DEBUG 04-22 00:22:34 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=242) DEBUG 04-22 00:22:34 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=242) DEBUG 04-22 00:22:34 [config/compilation.py:1195] disabled custom ops: Counter({'quant_fp8': 112, 'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=242) DEBUG 04-22 00:22:34 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(EngineCore pid=242) DEBUG 04-22 00:22:34 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00002.safetensors', 'model-00001-of-00002.safetensors']] -(EngineCore pid=242) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore 
pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=242) INFO 04-22 00:22:49 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/71f6b61ae1/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=a0449d9b50 comp=e546579c48 code=850d7e0a2e6ea6d15823f3dba6f5b3cc98fdb412a0d3254bb67dce9fa1730fd3 dir=/data/.cache/vllm/torch_compile_cache/71f6b61ae1/rank_0_0/backbone -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 
'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 
'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=242) DEBUG 04-22 00:22:49 
[compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore 
pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore 
pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=242) DEBUG 04-22 
00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, 
-(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 
'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 
[compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, 
-(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/backends.py:1074] Vllm config hash: a0449d9b50 -(EngineCore pid=242) INFO 04-22 00:22:49 [compilation/backends.py:1111] Dynamo bytecode transform time: 6.75 s -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=242) DEBUG 04-22 00:22:49 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=242) DEBUG 04-22 00:22:50 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=242) DEBUG 04-22 00:22:50 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=242) DEBUG 04-22 00:22:50 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms -(EngineCore pid=242) DEBUG 04-22 00:22:50 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=242) DEBUG 04-22 00:22:50 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(APIServer pid=1) DEBUG 04-22 00:22:52 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=242) INFO 04-22 00:22:52 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=242) DEBUG 04-22 00:22:52 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/71f6b61ae1/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=242) DEBUG 04-22 00:22:53 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=242) DEBUG 04-22 00:22:53 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=242) DEBUG 04-22 00:22:53 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.8 ms -(EngineCore pid=242) DEBUG 04-22 00:22:53 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=242) DEBUG 04-22 00:22:53 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=242) DEBUG 04-22 00:22:55 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/71f6b61ae1/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=242) DEBUG 04-22 00:22:56 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=242) DEBUG 04-22 00:22:56 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.3 ms -(EngineCore pid=242) DEBUG 04-22 00:22:56 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.5 ms -(EngineCore pid=242) DEBUG 04-22 00:22:56 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=242) DEBUG 04-22 00:22:56 
[compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(EngineCore pid=242) DEBUG 04-22 00:22:56 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/71f6b61ae1/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') -(EngineCore pid=242) INFO 04-22 00:22:56 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 7.24 s -(EngineCore pid=242) DEBUG 04-22 00:22:57 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/71f6b61ae1/rank_0_0/backbone/computation_graph.py -(EngineCore pid=242) INFO 04-22 00:22:58 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/f4154a96ec3c2b706aaa7f98465c13eb99158e9860b8e6dad729e3b5d6402e64/rank_0_0/model -(EngineCore pid=242) INFO 04-22 00:22:58 [compilation/monitor.py:48] torch.compile took 16.09 s in total -(EngineCore pid=242) INFO 04-22 00:22:59 [compilation/monitor.py:76] Initial profiling/warmup run took 0.33 s -(APIServer pid=1) DEBUG 04-22 00:23:02 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=242) INFO 04-22 00:23:04 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=242) DEBUG 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=242) INFO 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=242) DEBUG 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 00:23:04 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=242) DEBUG 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 00:23:04 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=242) DEBUG 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 108.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=242) DEBUG 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 00:23:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 00:23:04 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
-(EngineCore pid=242) DEBUG 04-22 00:23:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 00:23:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 00:23:05 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=242) DEBUG 04-22 00:23:05 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 228.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=242) DEBUG 04-22 00:23:05 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=242) INFO 04-22 00:23:05 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.71 GiB total -(EngineCore pid=242) DEBUG 04-22 00:23:05 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=242) DEBUG 04-22 00:23:05 [v1/worker/gpu_worker.py:430] Free memory after profiling: 69.06 GiB (total), 65.62 GiB (within requested) -(EngineCore pid=242) DEBUG 04-22 00:23:05 [v1/worker/gpu_worker.py:435] Memory profiling takes 23.23 seconds. Total non KV cache memory: 10.59GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.24GiB; weights memory: 8.14GiB. -(EngineCore pid=242) INFO 04-22 00:23:05 [v1/worker/gpu_worker.py:436] Available KV cache memory: 64.64 GiB -(EngineCore pid=242) INFO 04-22 00:23:05 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9590 to maintain the same effective KV cache size. 
-(EngineCore pid=242) INFO 04-22 00:23:05 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,210,304 tokens -(EngineCore pid=242) INFO 04-22 00:23:05 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 147.74x -(EngineCore pid=242) 2026-04-22 00:23:05,885 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=242) DEBUG 04-22 00:23:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) 2026-04-22 00:23:05,894 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=242) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 15:50:06 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 15:50:06 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 15:50:06 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:50:06 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 15:50:06 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 15:50:06 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-2-27b-it -(APIServer pid=1) INFO 04-22 15:50:06 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 15:50:06 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:50:06 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-2-27b-it', 'model': 'google/gemma-2-27b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 15:50:06 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 403, in hf_raise_for_status -(APIServer pid=1) response.raise_for_status() -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1028, in raise_for_status -(APIServer pid=1) raise HTTPError(http_error_msg, response=self) -(APIServer pid=1) requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/google/gemma-2-27b-it/resolve/main/config.json -(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1550, in _get_metadata_or_catch_error -(APIServer pid=1) metadata = get_hf_file_metadata( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn -(APIServer 
pid=1) return fn(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1467, in get_hf_file_metadata -(APIServer pid=1) r = _request_wrapper( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 283, in _request_wrapper -(APIServer pid=1) response = _request_wrapper( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 307, in _request_wrapper -(APIServer pid=1) hf_raise_for_status(response) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 467, in hf_raise_for_status -(APIServer pid=1) raise _format(HfHubHTTPError, message, response) from e -(APIServer pid=1) huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-69e8ee2e-545ab1ad15853b3475ad62dd;fd90b524-9997-45ee-982d-17bb6e249315) -(APIServer pid=1) -(APIServer pid=1) 403 Forbidden: Please enable access to public gated repositories in your fine-grained token settings to view this repository.. -(APIServer pid=1) Cannot access content at: https://huggingface.co/google/gemma-2-27b-it/resolve/main/config.json. -(APIServer pid=1) Make sure your token has the correct permissions. 
-(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 479, in cached_files -(APIServer pid=1) hf_hub_download( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn -(APIServer pid=1) return fn(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1014, in hf_hub_download -(APIServer pid=1) return _hf_hub_download_to_cache_dir( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1121, in _hf_hub_download_to_cache_dir -(APIServer pid=1) _raise_on_head_call_error(head_call_error, force_download, local_files_only) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1665, in _raise_on_head_call_error -(APIServer pid=1) raise LocalEntryNotFoundError( -(APIServer pid=1) huggingface_hub.errors.LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on. 
-(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in -(APIServer pid=1) sys.exit(main()) -(APIServer pid=1) ^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main -(APIServer pid=1) args.dispatch_function(args) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd -(APIServer pid=1) uvloop.run(run_server(args)) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run -(APIServer pid=1) return __asyncio.run( -(APIServer pid=1) ^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run -(APIServer pid=1) return runner.run(main) -(APIServer pid=1) ^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run -(APIServer pid=1) return self._loop.run_until_complete(task) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper -(APIServer pid=1) return await main -(APIServer pid=1) ^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server -(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker -(APIServer pid=1) async with build_async_engine_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return 
await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client -(APIServer pid=1) async with build_async_engine_client_from_engine_args( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 124, in build_async_engine_client_from_engine_args -(APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1539, in create_engine_config -(APIServer pid=1) maybe_override_with_speculators( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/config.py", line 583, in maybe_override_with_speculators -(APIServer pid=1) config_dict, _ = PretrainedConfig.get_config_dict( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 662, in get_config_dict -(APIServer pid=1) config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 721, in _get_config_dict -(APIServer pid=1) resolved_config_file = cached_file( -(APIServer pid=1) ^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 322, in cached_file -(APIServer pid=1) file = 
cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 553, in cached_files -(APIServer pid=1) raise OSError( -(APIServer pid=1) OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files. -(APIServer pid=1) Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'. diff --git a/accuracy/results/v0.19.0/logs/google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.log deleted file mode 100644 index c955da1e..00000000 --- a/accuracy/results/v0.19.0/logs/google-gemma-2-27b-it--h100-80gb--tp1pp1dp1--8192.log +++ /dev/null @@ -1,770 +0,0 @@ -DEBUG 04-22 15:58:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 15:58:01 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 15:58:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 15:58:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:58:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:58:01 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 15:58:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 15:58:01 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 15:58:01 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 15:58:01 [platforms/__init__.py:62] Checking if CUDA platform is available. 
-DEBUG 04-22 15:58:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:58:01 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 15:58:05 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 15:58:07 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 15:58:07 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 15:58:07 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 15:58:07 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 15:58:07 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 15:58:07 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:58:07 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 15:58:07 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 15:58:07 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-2-27b-it -(APIServer pid=1) INFO 04-22 15:58:07 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 15:58:07 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:58:07 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-2-27b-it', 'model': 'google/gemma-2-27b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 15:58:07 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 15:58:08 [model_executor/models/registry.py:827] Loaded model info for class 
vllm.model_executor.models.gemma2.Gemma2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 15:58:08 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003551 secs -(APIServer pid=1) INFO 04-22 15:58:08 [config/model.py:549] Resolved architecture: Gemma2ForCausalLM -(APIServer pid=1) INFO 04-22 15:58:08 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 15:58:08 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 15:58:08 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 15:58:08 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 15:58:08 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 15:58:08 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 15:58:08 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 15:58:08 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 15:58:08 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 15:58:08 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 15:58:10 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 15:58:10 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 15:58:14 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 15:58:14 [platforms/__init__.py:37] Checking if TPU platform is available. 
-DEBUG 04-22 15:58:14 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 15:58:14 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:58:14 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:58:14 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 15:58:14 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 15:58:14 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 15:58:14 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 15:58:14 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:58:14 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:58:14 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 15:58:19 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=286) DEBUG 04-22 15:58:20 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 15:58:20 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=286) DEBUG 04-22 15:58:20 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/e4f62627-5c43-4e86-9af9-f2733a970dd3'], outputs=['ipc:///tmp/5010f7f7-7c1c-4ad2-ae08-9526b73a064c'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=286) DEBUG 04-22 15:58:20 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=286) DEBUG 04-22 15:58:20 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=286) DEBUG 04-22 15:58:20 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=286) DEBUG 04-22 15:58:20 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=286) DEBUG 04-22 15:58:20 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=286) INFO 04-22 15:58:20 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google/gemma-2-27b-it', speculative_config=None, tokenizer='google/gemma-2-27b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google/gemma-2-27b-it, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=286) DEBUG 04-22 15:58:21 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.216:57701 backend=nccl -(EngineCore pid=286) INFO 04-22 15:58:21 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.216:57701 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=286) DEBUG 04-22 15:58:21 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=286) INFO 04-22 15:58:21 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=286) DEBUG 04-22 15:58:21 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776873501.6531677, auto_measure=True -(EngineCore pid=286) DEBUG 04-22 15:58:21 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=286) DEBUG 04-22 15:58:21 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=286) DEBUG 04-22 15:58:21 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=286) DEBUG 04-22 15:58:21 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=286) DEBUG 04-22 15:58:21 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=286) DEBUG 04-22 15:58:21 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=286) DEBUG 04-22 15:58:21 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=286) DEBUG 04-22 15:58:21 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=286) DEBUG 04-22 15:58:21 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=286) INFO 04-22 15:58:21 [v1/worker/gpu_model_runner.py:4735] Starting to load model google/gemma-2-27b-it... -(EngineCore pid=286) DEBUG 04-22 15:58:22 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=286) INFO 04-22 15:58:22 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=286) INFO 04-22 15:58:22 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=286) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=286) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=286) DEBUG 04-22 15:58:22 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=286) DEBUG 04-22 15:58:22 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=286) DEBUG 04-22 15:58:22 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 185, 'gelu_and_mul': 46, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'logits_processor': 1}) -(EngineCore pid=286) DEBUG 04-22 15:58:22 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=286) DEBUG 04-22 15:58:22 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00011-of-00012.safetensors', 'model-00003-of-00012.safetensors', 'model-00006-of-00012.safetensors', 'model-00001-of-00012.safetensors', 'model-00009-of-00012.safetensors', 'model-00002-of-00012.safetensors', 'model-00010-of-00012.safetensors', 'model-00012-of-00012.safetensors', 'model-00008-of-00012.safetensors', 'model-00005-of-00012.safetensors', 'model-00004-of-00012.safetensors', 'model-00007-of-00012.safetensors']] -(APIServer pid=1) DEBUG 04-22 15:58:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 15:58:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 15:58:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 15:59:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 15:59:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 15:59:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(APIServer pid=1) DEBUG 04-22 15:59:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 15:59:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 15:59:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 16:00:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 16:00:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 16:00:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=286) INFO 04-22 16:00:22 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for google/gemma-2-27b-it: 120.123986 seconds -(EngineCore pid=286) Loading safetensors checkpoint shards: 0% Completed | 0/12 [00:00 -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gemma2.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=286) INFO 04-22 16:01:17 [compilation/backends.py:1051] 
Using cache directory: /data/.cache/vllm/torch_compile_cache/8070fc1ef5/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=a866ad9f99 comp=e546579c48 code=1effc1a117077c34d7781a48a74d1e9a6c34076dbd4b7937695d68b6a3d23a2c dir=/data/.cache/vllm/torch_compile_cache/8070fc1ef5/rank_0_0/backbone -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 
'auto', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, 
-(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=286) DEBUG 04-22 16:01:17 
[compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=286) DEBUG 04-22 16:01:17 
[compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=286) DEBUG 04-22 16:01:17 
[compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
-(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/backends.py:1074] Vllm config hash: a866ad9f99 -(EngineCore pid=286) INFO 04-22 16:01:17 [compilation/backends.py:1111] Dynamo bytecode transform time: 6.19 s -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=286) DEBUG 04-22 16:01:17 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=286) DEBUG 04-22 16:01:18 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=286) DEBUG 04-22 16:01:18 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=286) DEBUG 04-22 16:01:18 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=286) DEBUG 04-22 16:01:18 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=286) DEBUG 04-22 16:01:18 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=286) INFO 04-22 16:01:20 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=286) DEBUG 04-22 16:01:20 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/8070fc1ef5/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(APIServer pid=1) DEBUG 04-22 16:01:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=286) DEBUG 04-22 16:01:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=286) DEBUG 04-22 16:01:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=286) DEBUG 04-22 16:01:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.7 ms -(EngineCore pid=286) DEBUG 04-22 16:01:21 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=286) DEBUG 04-22 16:01:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=286) DEBUG 04-22 16:01:22 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/8070fc1ef5/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=286) DEBUG 04-22 16:01:23 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=286) DEBUG 04-22 16:01:23 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.4 ms -(EngineCore pid=286) DEBUG 04-22 16:01:23 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.5 ms -(EngineCore pid=286) DEBUG 04-22 16:01:23 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=286) DEBUG 04-22 16:01:23 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(EngineCore pid=286) DEBUG 04-22 16:01:24 [compilation/backends.py:377] Store the 46-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_46', '/data/.cache/vllm/torch_compile_cache/8070fc1ef5/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_46') -(EngineCore pid=286) INFO 04-22 16:01:24 
[compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.31 s -(EngineCore pid=286) DEBUG 04-22 16:01:24 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/8070fc1ef5/rank_0_0/backbone/computation_graph.py -(EngineCore pid=286) INFO 04-22 16:01:26 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/eb9d3a1e33941e4c476db9ae83727bbefc20c4ff28627d8fa1c23be91a2ee24c/rank_0_0/model -(EngineCore pid=286) INFO 04-22 16:01:26 [compilation/monitor.py:48] torch.compile took 14.67 s in total -(EngineCore pid=286) INFO 04-22 16:01:26 [compilation/monitor.py:76] Initial profiling/warmup run took 0.28 s -(APIServer pid=1) DEBUG 04-22 16:01:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=286) INFO 04-22 16:01:32 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=286) INFO 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=286) DEBUG 04-22 16:01:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, 
ubatch_slices_padded: None -(EngineCore pid=286) DEBUG 04-22 16:01:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 212.00 MiB first-capture + (51-1) × 10.00 MiB per-graph -(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=286) DEBUG 04-22 16:01:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=286) DEBUG 04-22 16:01:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=286) DEBUG 04-22 16:01:32 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 8.00 MiB per-graph -(EngineCore pid=286) DEBUG 04-22 16:01:33 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=286) INFO 04-22 16:01:33 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.13 GiB total -(EngineCore pid=286) DEBUG 04-22 16:01:33 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=286) DEBUG 04-22 16:01:33 [v1/worker/gpu_worker.py:430] Free memory after profiling: 
25.35 GiB (total), 21.91 GiB (within requested) -(EngineCore pid=286) DEBUG 04-22 16:01:33 [v1/worker/gpu_worker.py:435] Memory profiling takes 22.38 seconds. Total non KV cache memory: 54.64GiB; torch peak memory increase: 3.66GiB; non-torch forward increase memory: 0.26GiB; weights memory: 50.72GiB. -(EngineCore pid=286) INFO 04-22 16:01:33 [v1/worker/gpu_worker.py:436] Available KV cache memory: 20.59 GiB -(EngineCore pid=286) INFO 04-22 16:01:33 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9643 to maintain the same effective KV cache size. -(EngineCore pid=286) INFO 04-22 16:01:33 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 58,672 tokens -(EngineCore pid=286) INFO 04-22 16:01:33 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 7.16x -(EngineCore pid=286) 2026-04-22 16:01:33,734 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=286) DEBUG 04-22 16:01:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=286) 2026-04-22 16:01:33,745 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=286) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 15:49:40 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 15:49:40 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 15:49:40 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:49:40 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 15:49:40 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 15:49:40 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-2-2b-it -(APIServer pid=1) INFO 04-22 15:49:40 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 15:49:40 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:49:40 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-2-2b-it', 'model': 'google/gemma-2-2b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 15:49:40 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 403, in hf_raise_for_status -(APIServer pid=1) response.raise_for_status() -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1028, in raise_for_status -(APIServer pid=1) raise HTTPError(http_error_msg, response=self) -(APIServer pid=1) requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/google/gemma-2-2b-it/resolve/main/config.json -(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1550, in _get_metadata_or_catch_error -(APIServer pid=1) metadata = get_hf_file_metadata( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn -(APIServer pid=1) 
return fn(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1467, in get_hf_file_metadata -(APIServer pid=1) r = _request_wrapper( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 283, in _request_wrapper -(APIServer pid=1) response = _request_wrapper( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 307, in _request_wrapper -(APIServer pid=1) hf_raise_for_status(response) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 467, in hf_raise_for_status -(APIServer pid=1) raise _format(HfHubHTTPError, message, response) from e -(APIServer pid=1) huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-69e8ee14-22d7cb9163602c5941ebd21d;57340d59-f182-4b7d-a919-4768efc8392f) -(APIServer pid=1) -(APIServer pid=1) 403 Forbidden: Please enable access to public gated repositories in your fine-grained token settings to view this repository.. -(APIServer pid=1) Cannot access content at: https://huggingface.co/google/gemma-2-2b-it/resolve/main/config.json. -(APIServer pid=1) Make sure your token has the correct permissions. 
-(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 479, in cached_files -(APIServer pid=1) hf_hub_download( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn -(APIServer pid=1) return fn(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1014, in hf_hub_download -(APIServer pid=1) return _hf_hub_download_to_cache_dir( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1121, in _hf_hub_download_to_cache_dir -(APIServer pid=1) _raise_on_head_call_error(head_call_error, force_download, local_files_only) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1665, in _raise_on_head_call_error -(APIServer pid=1) raise LocalEntryNotFoundError( -(APIServer pid=1) huggingface_hub.errors.LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on. 
-(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in -(APIServer pid=1) sys.exit(main()) -(APIServer pid=1) ^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main -(APIServer pid=1) args.dispatch_function(args) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd -(APIServer pid=1) uvloop.run(run_server(args)) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run -(APIServer pid=1) return __asyncio.run( -(APIServer pid=1) ^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run -(APIServer pid=1) return runner.run(main) -(APIServer pid=1) ^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run -(APIServer pid=1) return self._loop.run_until_complete(task) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper -(APIServer pid=1) return await main -(APIServer pid=1) ^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server -(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker -(APIServer pid=1) async with build_async_engine_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return 
await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client -(APIServer pid=1) async with build_async_engine_client_from_engine_args( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 124, in build_async_engine_client_from_engine_args -(APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1539, in create_engine_config -(APIServer pid=1) maybe_override_with_speculators( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/config.py", line 583, in maybe_override_with_speculators -(APIServer pid=1) config_dict, _ = PretrainedConfig.get_config_dict( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 662, in get_config_dict -(APIServer pid=1) config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 721, in _get_config_dict -(APIServer pid=1) resolved_config_file = cached_file( -(APIServer pid=1) ^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 322, in cached_file -(APIServer pid=1) file = 
cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 553, in cached_files -(APIServer pid=1) raise OSError( -(APIServer pid=1) OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files. -(APIServer pid=1) Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'. diff --git a/accuracy/results/v0.19.0/logs/google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.log deleted file mode 100644 index 4133bbe3..00000000 --- a/accuracy/results/v0.19.0/logs/google-gemma-2-2b-it--h100-80gb--tp1pp1dp1--8192.log +++ /dev/null @@ -1,745 +0,0 @@ -DEBUG 04-22 15:55:00 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 15:55:00 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 15:55:00 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 15:55:00 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:55:00 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:55:00 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 15:55:00 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 15:55:00 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 15:55:00 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 15:55:00 [platforms/__init__.py:62] Checking if CUDA platform is available. 
-DEBUG 04-22 15:55:00 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:55:00 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 15:55:04 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 15:55:06 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 15:55:06 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 15:55:06 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 15:55:06 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 15:55:06 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 15:55:06 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:55:06 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 15:55:06 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 15:55:06 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-2-2b-it -(APIServer pid=1) INFO 04-22 15:55:06 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 15:55:06 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:55:06 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-2-2b-it', 'model': 'google/gemma-2-2b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 15:55:06 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 15:55:07 [model_executor/models/registry.py:774] Cached model info file for class 
vllm.model_executor.models.gemma2.Gemma2ForCausalLM not found -(APIServer pid=1) DEBUG 04-22 15:55:07 [model_executor/models/registry.py:834] Cache model info for class vllm.model_executor.models.gemma2.Gemma2ForCausalLM miss. Loading model instead. -(APIServer pid=1) DEBUG 04-22 15:55:16 [model_executor/models/registry.py:844] Loaded model info for class vllm.model_executor.models.gemma2.Gemma2ForCausalLM -(APIServer pid=1) DEBUG 04-22 15:55:16 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 9.7262713 secs -(APIServer pid=1) INFO 04-22 15:55:16 [config/model.py:549] Resolved architecture: Gemma2ForCausalLM -(APIServer pid=1) INFO 04-22 15:55:16 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 15:55:16 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 15:55:16 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 15:55:16 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 15:55:16 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 15:55:16 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 15:55:16 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 15:55:16 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 15:55:16 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. 
-(APIServer pid=1) DEBUG 04-22 15:55:16 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 15:55:19 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 15:55:19 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 15:55:22 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 15:55:22 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 15:55:22 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 15:55:22 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:55:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:55:22 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 15:55:22 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 15:55:22 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 15:55:22 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 15:55:22 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:55:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:55:22 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 15:55:27 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=474) DEBUG 04-22 15:55:29 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 15:55:29 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=474) DEBUG 04-22 15:55:29 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/7d536ea1-41ff-4939-a7ea-83235282434d'], outputs=['ipc:///tmp/13c3abbe-776f-4b4b-8481-36eeaa00e050'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=474) DEBUG 04-22 15:55:29 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=474) DEBUG 04-22 15:55:29 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=474) DEBUG 04-22 15:55:29 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=474) DEBUG 04-22 15:55:29 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=474) DEBUG 04-22 15:55:29 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=474) INFO 04-22 15:55:29 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google/gemma-2-2b-it', speculative_config=None, tokenizer='google/gemma-2-2b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google/gemma-2-2b-it, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=474) DEBUG 04-22 15:55:29 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.209:49295 backend=nccl -(EngineCore pid=474) INFO 04-22 15:55:29 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.209:49295 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=474) DEBUG 04-22 15:55:30 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=474) INFO 04-22 15:55:30 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=474) DEBUG 04-22 15:55:30 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776873330.4498482, auto_measure=True -(EngineCore pid=474) DEBUG 04-22 15:55:30 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=474) DEBUG 04-22 15:55:30 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=474) DEBUG 04-22 15:55:30 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=474) DEBUG 04-22 15:55:30 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=474) DEBUG 04-22 15:55:30 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=474) DEBUG 04-22 15:55:30 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=474) DEBUG 04-22 15:55:30 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=474) DEBUG 04-22 15:55:30 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=474) DEBUG 04-22 15:55:30 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=474) INFO 04-22 15:55:30 [v1/worker/gpu_model_runner.py:4735] Starting to load model google/gemma-2-2b-it... -(EngineCore pid=474) DEBUG 04-22 15:55:31 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=256, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=474) INFO 04-22 15:55:31 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=474) INFO 04-22 15:55:31 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=474) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=474) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=474) DEBUG 04-22 15:55:31 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=474) DEBUG 04-22 15:55:31 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=474) DEBUG 04-22 15:55:31 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 105, 'gelu_and_mul': 26, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'logits_processor': 1}) -(EngineCore pid=474) DEBUG 04-22 15:55:31 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=474) DEBUG 04-22 15:55:31 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00002.safetensors', 'model-00001-of-00002.safetensors']] -(APIServer pid=1) DEBUG 04-22 15:55:39 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=474) INFO 04-22 15:55:40 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for google/gemma-2-2b-it: 8.440384 seconds -(EngineCore pid=474) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gemma2.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=474) DEBUG 04-22 15:55:49 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=474) INFO 04-22 15:55:49 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/92cf196b29/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=9e1fbf4885 comp=e546579c48 code=1effc1a117077c34d7781a48a74d1e9a6c34076dbd4b7937695d68b6a3d23a2c dir=/data/.cache/vllm/torch_compile_cache/92cf196b29/rank_0_0/backbone -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, 
-(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=474) DEBUG 04-22 15:55:49 
[compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 
[compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', 
-(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, 
-(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, 
-(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=474) 
DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=474) DEBUG 04-22 15:55:49 
[compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore 
pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 
'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/backends.py:1074] Vllm config hash: 9e1fbf4885 -(EngineCore pid=474) INFO 04-22 15:55:49 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.85 s -(APIServer pid=1) DEBUG 04-22 15:55:49 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=474) DEBUG 04-22 15:55:49 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=474) DEBUG 04-22 15:55:50 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=474) DEBUG 04-22 15:55:50 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=474) DEBUG 04-22 15:55:50 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms -(EngineCore pid=474) DEBUG 04-22 15:55:50 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=474) DEBUG 04-22 15:55:50 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=474) INFO 04-22 15:55:51 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=474) DEBUG 04-22 15:55:51 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/92cf196b29/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=474) DEBUG 04-22 15:55:52 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=474) DEBUG 04-22 15:55:52 
[compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=474) DEBUG 04-22 15:55:52 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.4 ms -(EngineCore pid=474) DEBUG 04-22 15:55:52 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=474) DEBUG 04-22 15:55:52 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=474) DEBUG 04-22 15:55:54 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/92cf196b29/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=474) DEBUG 04-22 15:55:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=474) DEBUG 04-22 15:55:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.2 ms -(EngineCore pid=474) DEBUG 04-22 15:55:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.3 ms -(EngineCore pid=474) DEBUG 04-22 15:55:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=474) DEBUG 04-22 15:55:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(EngineCore pid=474) DEBUG 04-22 15:55:55 [compilation/backends.py:377] Store the 26-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_26', '/data/.cache/vllm/torch_compile_cache/92cf196b29/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_26') -(EngineCore pid=474) INFO 04-22 15:55:55 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.97 s -(EngineCore pid=474) DEBUG 04-22 15:55:55 [compilation/backends.py:1228] Computation graph 
saved to /data/.cache/vllm/torch_compile_cache/92cf196b29/rank_0_0/backbone/computation_graph.py -(EngineCore pid=474) INFO 04-22 15:55:56 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/1d940999550d016d5904ae05d31d0ec822db5c3ea1c63a9eedf126a4d6b58248/rank_0_0/model -(EngineCore pid=474) INFO 04-22 15:55:56 [compilation/monitor.py:48] torch.compile took 11.46 s in total -(EngineCore pid=474) INFO 04-22 15:55:57 [compilation/monitor.py:76] Initial profiling/warmup run took 0.28 s -(APIServer pid=1) DEBUG 04-22 15:55:59 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=474) INFO 04-22 15:56:02 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=474) DEBUG 04-22 15:56:02 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=474) INFO 04-22 15:56:02 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=474) DEBUG 04-22 15:56:03 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=474) DEBUG 04-22 15:56:03 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, 
uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 118.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=474) DEBUG 04-22 15:56:03 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=474) DEBUG 04-22 15:56:03 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 134.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=474) INFO 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.52 GiB total -(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_worker.py:430] Free memory after profiling: 72.29 GiB (total), 68.84 GiB (within requested) -(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_worker.py:435] Memory profiling takes 18.59 seconds. 
Total non KV cache memory: 8.76GiB; torch peak memory increase: 3.62GiB; non-torch forward increase memory: 0.24GiB; weights memory: 4.9GiB. -(EngineCore pid=474) INFO 04-22 15:56:03 [v1/worker/gpu_worker.py:436] Available KV cache memory: 66.47 GiB -(EngineCore pid=474) INFO 04-22 15:56:03 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9566 to maintain the same effective KV cache size. -(EngineCore pid=474) INFO 04-22 15:56:03 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 670,160 tokens -(EngineCore pid=474) INFO 04-22 15:56:03 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 81.73x -(EngineCore pid=474) 2026-04-22 15:56:03,910 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=474) DEBUG 04-22 15:56:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=474) 2026-04-22 15:56:03,916 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=474) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 15:49:53 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 15:49:53 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 15:49:53 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:49:53 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 15:49:53 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 15:49:53 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-2-9b-it -(APIServer pid=1) INFO 04-22 15:49:53 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 15:49:53 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:49:53 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-2-9b-it', 'model': 'google/gemma-2-9b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 15:49:53 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 403, in hf_raise_for_status -(APIServer pid=1) response.raise_for_status() -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1028, in raise_for_status -(APIServer pid=1) raise HTTPError(http_error_msg, response=self) -(APIServer pid=1) requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/google/gemma-2-9b-it/resolve/main/config.json -(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1550, in _get_metadata_or_catch_error -(APIServer pid=1) metadata = get_hf_file_metadata( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn -(APIServer pid=1) 
return fn(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1467, in get_hf_file_metadata -(APIServer pid=1) r = _request_wrapper( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 283, in _request_wrapper -(APIServer pid=1) response = _request_wrapper( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 307, in _request_wrapper -(APIServer pid=1) hf_raise_for_status(response) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 467, in hf_raise_for_status -(APIServer pid=1) raise _format(HfHubHTTPError, message, response) from e -(APIServer pid=1) huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-69e8ee21-227b87a308f241507904e4ee;d5a6ca02-00be-4c30-9f6f-bc2dc9bc5644) -(APIServer pid=1) -(APIServer pid=1) 403 Forbidden: Please enable access to public gated repositories in your fine-grained token settings to view this repository.. -(APIServer pid=1) Cannot access content at: https://huggingface.co/google/gemma-2-9b-it/resolve/main/config.json. -(APIServer pid=1) Make sure your token has the correct permissions. 
-(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 479, in cached_files -(APIServer pid=1) hf_hub_download( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn -(APIServer pid=1) return fn(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1014, in hf_hub_download -(APIServer pid=1) return _hf_hub_download_to_cache_dir( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1121, in _hf_hub_download_to_cache_dir -(APIServer pid=1) _raise_on_head_call_error(head_call_error, force_download, local_files_only) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1665, in _raise_on_head_call_error -(APIServer pid=1) raise LocalEntryNotFoundError( -(APIServer pid=1) huggingface_hub.errors.LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on. 
-(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in -(APIServer pid=1) sys.exit(main()) -(APIServer pid=1) ^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main -(APIServer pid=1) args.dispatch_function(args) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd -(APIServer pid=1) uvloop.run(run_server(args)) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run -(APIServer pid=1) return __asyncio.run( -(APIServer pid=1) ^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run -(APIServer pid=1) return runner.run(main) -(APIServer pid=1) ^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run -(APIServer pid=1) return self._loop.run_until_complete(task) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper -(APIServer pid=1) return await main -(APIServer pid=1) ^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server -(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker -(APIServer pid=1) async with build_async_engine_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return 
await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client -(APIServer pid=1) async with build_async_engine_client_from_engine_args( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 124, in build_async_engine_client_from_engine_args -(APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1539, in create_engine_config -(APIServer pid=1) maybe_override_with_speculators( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/config.py", line 583, in maybe_override_with_speculators -(APIServer pid=1) config_dict, _ = PretrainedConfig.get_config_dict( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 662, in get_config_dict -(APIServer pid=1) config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 721, in _get_config_dict -(APIServer pid=1) resolved_config_file = cached_file( -(APIServer pid=1) ^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 322, in cached_file -(APIServer pid=1) file = 
cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 553, in cached_files -(APIServer pid=1) raise OSError( -(APIServer pid=1) OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files. -(APIServer pid=1) Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'. diff --git a/accuracy/results/v0.19.0/logs/google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.log deleted file mode 100644 index 98d9fb14..00000000 --- a/accuracy/results/v0.19.0/logs/google-gemma-2-9b-it--h100-80gb--tp1pp1dp1--8192.log +++ /dev/null @@ -1,749 +0,0 @@ -DEBUG 04-22 15:56:20 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 15:56:20 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 15:56:20 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 15:56:20 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:56:20 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:56:20 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 15:56:20 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 15:56:20 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 15:56:20 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 15:56:20 [platforms/__init__.py:62] Checking if CUDA platform is available. 
-DEBUG 04-22 15:56:20 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:56:20 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 15:56:25 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 15:56:27 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 15:56:27 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 15:56:27 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 15:56:27 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 15:56:27 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 15:56:27 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:56:27 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 15:56:27 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 15:56:27 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-2-9b-it -(APIServer pid=1) INFO 04-22 15:56:27 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 15:56:27 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:56:27 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-2-9b-it', 'model': 'google/gemma-2-9b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 15:56:27 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 15:56:27 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.gemma2.Gemma2ForCausalLM 
from cache -(APIServer pid=1) DEBUG 04-22 15:56:27 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003558 secs -(APIServer pid=1) INFO 04-22 15:56:27 [config/model.py:549] Resolved architecture: Gemma2ForCausalLM -(APIServer pid=1) INFO 04-22 15:56:27 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 15:56:27 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 15:56:27 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 15:56:27 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 15:56:27 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 15:56:27 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 15:56:27 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 15:56:27 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 15:56:27 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 15:56:27 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 15:56:29 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 15:56:29 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 15:56:33 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 15:56:33 [platforms/__init__.py:37] Checking if TPU platform is available. 
-DEBUG 04-22 15:56:33 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 15:56:33 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:56:33 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:56:33 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 15:56:33 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 15:56:33 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 15:56:33 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 15:56:33 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 15:56:33 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 15:56:33 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 15:56:38 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=285) DEBUG 04-22 15:56:39 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 15:56:39 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=285) DEBUG 04-22 15:56:39 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/b04747ea-0d9b-4301-a190-f022a3dff208'], outputs=['ipc:///tmp/a862b31a-8f2a-4a45-9ec5-5716564fee40'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=285) DEBUG 04-22 15:56:39 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=285) DEBUG 04-22 15:56:39 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=285) DEBUG 04-22 15:56:39 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=285) DEBUG 04-22 15:56:39 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=285) DEBUG 04-22 15:56:39 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=285) INFO 04-22 15:56:39 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google/gemma-2-9b-it', speculative_config=None, tokenizer='google/gemma-2-9b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google/gemma-2-9b-it, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=285) DEBUG 04-22 15:56:40 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.211:46303 backend=nccl -(EngineCore pid=285) INFO 04-22 15:56:40 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.211:46303 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=285) DEBUG 04-22 15:56:40 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=285) INFO 04-22 15:56:40 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=285) DEBUG 04-22 15:56:40 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776873400.8206272, auto_measure=True -(EngineCore pid=285) DEBUG 04-22 15:56:40 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=285) DEBUG 04-22 15:56:40 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=285) DEBUG 04-22 15:56:40 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=285) DEBUG 04-22 15:56:40 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=285) DEBUG 04-22 15:56:40 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=285) DEBUG 04-22 15:56:40 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=285) DEBUG 04-22 15:56:41 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=285) DEBUG 04-22 15:56:41 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=285) DEBUG 04-22 15:56:41 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=285) INFO 04-22 15:56:41 [v1/worker/gpu_model_runner.py:4735] Starting to load model google/gemma-2-9b-it... -(EngineCore pid=285) DEBUG 04-22 15:56:41 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=256, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=285) INFO 04-22 15:56:41 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=285) INFO 04-22 15:56:41 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=285) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=285) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=285) DEBUG 04-22 15:56:41 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=285) DEBUG 04-22 15:56:41 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=285) DEBUG 04-22 15:56:41 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 169, 'gelu_and_mul': 42, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'logits_processor': 1}) -(EngineCore pid=285) DEBUG 04-22 15:56:41 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=285) DEBUG 04-22 15:56:42 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors']] -(APIServer pid=1) DEBUG 04-22 15:56:49 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 15:56:59 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=285) INFO 04-22 15:57:07 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for google/gemma-2-9b-it: 25.415698 seconds -(EngineCore pid=285) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(APIServer pid=1) DEBUG 04-22 15:57:29 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gemma2.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=285) INFO 04-22 15:57:30 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/938dccb18b/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=2fa46dba6a comp=e546579c48 code=1effc1a117077c34d7781a48a74d1e9a6c34076dbd4b7937695d68b6a3d23a2c dir=/data/.cache/vllm/torch_compile_cache/938dccb18b/rank_0_0/backbone -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, 
-(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 
[compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 
[compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 
[compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', 
-(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 
1, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=285) DEBUG 04-22 15:57:30 
[compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 
[compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, 
-(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/backends.py:1074] Vllm config hash: 2fa46dba6a -(EngineCore pid=285) INFO 04-22 15:57:30 [compilation/backends.py:1111] Dynamo bytecode transform time: 5.96 s -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=285) DEBUG 04-22 15:57:30 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=285) DEBUG 04-22 15:57:31 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=285) DEBUG 04-22 15:57:31 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=285) DEBUG 04-22 15:57:31 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=285) DEBUG 04-22 15:57:31 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=285) DEBUG 04-22 15:57:31 [compilation/passes/vllm_inductor_pass.py:79] 
FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=285) INFO 04-22 15:57:32 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=285) DEBUG 04-22 15:57:32 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/938dccb18b/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=285) DEBUG 04-22 15:57:33 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=285) DEBUG 04-22 15:57:33 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=285) DEBUG 04-22 15:57:33 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.4 ms -(EngineCore pid=285) DEBUG 04-22 15:57:33 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=285) DEBUG 04-22 15:57:33 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=285) DEBUG 04-22 15:57:35 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/938dccb18b/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=285) DEBUG 04-22 15:57:36 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=285) DEBUG 04-22 15:57:36 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.2 ms -(EngineCore pid=285) DEBUG 04-22 15:57:36 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.6 ms -(EngineCore pid=285) DEBUG 04-22 15:57:36 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore 
pid=285) DEBUG 04-22 15:57:36 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(EngineCore pid=285) DEBUG 04-22 15:57:36 [compilation/backends.py:377] Store the 42-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_42', '/data/.cache/vllm/torch_compile_cache/938dccb18b/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_42') -(EngineCore pid=285) INFO 04-22 15:57:36 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.31 s -(EngineCore pid=285) DEBUG 04-22 15:57:36 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/938dccb18b/rank_0_0/backbone/computation_graph.py -(EngineCore pid=285) INFO 04-22 15:57:38 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/e599b74f8707bd2734d2d72d08671f09975bf502f65698747fb9e3a189bfc9db/rank_0_0/model -(EngineCore pid=285) INFO 04-22 15:57:38 [compilation/monitor.py:48] torch.compile took 14.31 s in total -(EngineCore pid=285) INFO 04-22 15:57:38 [compilation/monitor.py:76] Initial profiling/warmup run took 0.25 s -(APIServer pid=1) DEBUG 04-22 15:57:39 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=285) INFO 04-22 15:57:44 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=285) INFO 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=285) DEBUG 04-22 15:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=285) DEBUG 04-22 15:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 146.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=285) DEBUG 04-22 15:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
-(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=285) DEBUG 04-22 15:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=285) DEBUG 04-22 15:57:44 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=285) DEBUG 04-22 15:57:45 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=285) INFO 04-22 15:57:45 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total -(EngineCore pid=285) DEBUG 04-22 15:57:45 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=285) DEBUG 04-22 15:57:45 [v1/worker/gpu_worker.py:430] Free memory after profiling: 59.37 GiB (total), 55.92 GiB (within requested) -(EngineCore pid=285) DEBUG 04-22 15:57:45 [v1/worker/gpu_worker.py:435] Memory profiling takes 21.51 seconds. Total non KV cache memory: 21.11GiB; torch peak memory increase: 3.65GiB; non-torch forward increase memory: 0.25GiB; weights memory: 17.22GiB. -(EngineCore pid=285) INFO 04-22 15:57:45 [v1/worker/gpu_worker.py:436] Available KV cache memory: 54.12 GiB -(EngineCore pid=285) INFO 04-22 15:57:45 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. 
-(EngineCore pid=285) INFO 04-22 15:57:45 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 168,880 tokens -(EngineCore pid=285) INFO 04-22 15:57:45 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 20.60x -(EngineCore pid=285) 2026-04-22 15:57:45,711 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=285) DEBUG 04-22 15:57:45 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=285) 2026-04-22 15:57:45,720 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=285) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 15:50:32 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 15:50:32 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 15:50:32 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:50:32 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 15:50:32 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 15:50:32 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-3-12b-it -(APIServer pid=1) INFO 04-22 15:50:32 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 15:50:32 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:50:32 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-3-12b-it', 'model': 'google/gemma-3-12b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 15:50:32 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 403, in hf_raise_for_status -(APIServer pid=1) response.raise_for_status() -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1028, in raise_for_status -(APIServer pid=1) raise HTTPError(http_error_msg, response=self) -(APIServer pid=1) requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/google/gemma-3-12b-it/resolve/main/config.json -(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1550, in _get_metadata_or_catch_error -(APIServer pid=1) metadata = get_hf_file_metadata( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn -(APIServer 
pid=1) return fn(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1467, in get_hf_file_metadata -(APIServer pid=1) r = _request_wrapper( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 283, in _request_wrapper -(APIServer pid=1) response = _request_wrapper( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 307, in _request_wrapper -(APIServer pid=1) hf_raise_for_status(response) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 467, in hf_raise_for_status -(APIServer pid=1) raise _format(HfHubHTTPError, message, response) from e -(APIServer pid=1) huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-69e8ee48-558ebd955f7a279229ca7b78;f864c0bf-514c-4724-85b6-fb6b29a05337) -(APIServer pid=1) -(APIServer pid=1) 403 Forbidden: Please enable access to public gated repositories in your fine-grained token settings to view this repository.. -(APIServer pid=1) Cannot access content at: https://huggingface.co/google/gemma-3-12b-it/resolve/main/config.json. -(APIServer pid=1) Make sure your token has the correct permissions. 
-(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 479, in cached_files -(APIServer pid=1) hf_hub_download( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn -(APIServer pid=1) return fn(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1014, in hf_hub_download -(APIServer pid=1) return _hf_hub_download_to_cache_dir( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1121, in _hf_hub_download_to_cache_dir -(APIServer pid=1) _raise_on_head_call_error(head_call_error, force_download, local_files_only) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1665, in _raise_on_head_call_error -(APIServer pid=1) raise LocalEntryNotFoundError( -(APIServer pid=1) huggingface_hub.errors.LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on. 
-(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in -(APIServer pid=1) sys.exit(main()) -(APIServer pid=1) ^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main -(APIServer pid=1) args.dispatch_function(args) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd -(APIServer pid=1) uvloop.run(run_server(args)) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run -(APIServer pid=1) return __asyncio.run( -(APIServer pid=1) ^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run -(APIServer pid=1) return runner.run(main) -(APIServer pid=1) ^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run -(APIServer pid=1) return self._loop.run_until_complete(task) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper -(APIServer pid=1) return await main -(APIServer pid=1) ^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server -(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker -(APIServer pid=1) async with build_async_engine_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return 
await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client -(APIServer pid=1) async with build_async_engine_client_from_engine_args( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 124, in build_async_engine_client_from_engine_args -(APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1539, in create_engine_config -(APIServer pid=1) maybe_override_with_speculators( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/config.py", line 583, in maybe_override_with_speculators -(APIServer pid=1) config_dict, _ = PretrainedConfig.get_config_dict( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 662, in get_config_dict -(APIServer pid=1) config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 721, in _get_config_dict -(APIServer pid=1) resolved_config_file = cached_file( -(APIServer pid=1) ^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 322, in cached_file -(APIServer pid=1) file = 
cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 553, in cached_files -(APIServer pid=1) raise OSError( -(APIServer pid=1) OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files. -(APIServer pid=1) Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'. diff --git a/accuracy/results/v0.19.0/logs/google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.log deleted file mode 100644 index 354a2364..00000000 --- a/accuracy/results/v0.19.0/logs/google-gemma-3-12b-it--h100-80gb--tp1pp1dp1--8192.log +++ /dev/null @@ -1,769 +0,0 @@ -DEBUG 04-22 16:04:02 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 16:04:02 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 16:04:02 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 16:04:02 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 16:04:02 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 16:04:02 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 16:04:02 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 16:04:02 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 16:04:02 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 16:04:02 [platforms/__init__.py:62] Checking if CUDA platform is available. 
-DEBUG 04-22 16:04:02 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 16:04:02 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 16:04:07 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 16:04:09 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 16:04:09 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 16:04:09 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 16:04:09 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 16:04:09 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 16:04:09 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 16:04:09 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 16:04:09 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 16:04:09 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-3-12b-it -(APIServer pid=1) INFO 04-22 16:04:09 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 16:04:09 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 16:04:09 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-3-12b-it', 'model': 'google/gemma-3-12b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 16:04:09 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 16:04:09 [model_executor/models/registry.py:827] Loaded model info for class 
vllm.model_executor.models.gemma3_mm.Gemma3ForConditionalGeneration from cache -(APIServer pid=1) DEBUG 04-22 16:04:09 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0004434 secs -(APIServer pid=1) INFO 04-22 16:04:09 [config/model.py:549] Resolved architecture: Gemma3ForConditionalGeneration -(APIServer pid=1) INFO 04-22 16:04:09 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 16:04:09 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 16:04:09 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 16:04:09 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 16:04:09 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 16:04:09 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 16:04:09 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 16:04:09 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) WARNING 04-22 16:04:09 [platforms/cuda.py:199] Forcing --disable_chunked_mm_input for models with multimodal-bidirectional attention. -(APIServer pid=1) DEBUG 04-22 16:04:09 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 16:04:09 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 16:04:12 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 16:04:13 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. 
-(APIServer pid=1) DEBUG 04-22 16:04:14 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -(APIServer pid=1) Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. -DEBUG 04-22 16:04:26 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 16:04:26 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 16:04:26 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 16:04:26 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 16:04:26 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 16:04:26 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 16:04:26 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 16:04:26 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 16:04:26 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 16:04:26 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 16:04:26 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 16:04:26 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 16:04:31 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(APIServer pid=1) DEBUG 04-22 16:04:32 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. -(EngineCore pid=510) DEBUG 04-22 16:04:33 [v1/engine/core.py:1018] Waiting for init message from front-end. 
-(APIServer pid=1) DEBUG 04-22 16:04:33 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=510) DEBUG 04-22 16:04:33 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/908ab5ba-95d5-4a37-881e-6edc152f135d'], outputs=['ipc:///tmp/36f6cabd-2ae1-4604-93cc-5accfd67e0cd'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=510) DEBUG 04-22 16:04:33 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=510) DEBUG 04-22 16:04:33 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=510) DEBUG 04-22 16:04:33 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=510) DEBUG 04-22 16:04:33 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=510) DEBUG 04-22 16:04:33 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=510) INFO 04-22 16:04:33 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google/gemma-3-12b-it', speculative_config=None, tokenizer='google/gemma-3-12b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google/gemma-3-12b-it, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=510) DEBUG 04-22 16:04:33 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(EngineCore pid=510) DEBUG 04-22 16:04:35 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.219:60697 backend=nccl -(EngineCore pid=510) INFO 04-22 16:04:35 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.219:60697 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(EngineCore pid=510) DEBUG 04-22 16:04:35 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=510) INFO 04-22 16:04:35 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=510) DEBUG 04-22 16:04:35 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776873875.928553, auto_measure=True -(EngineCore pid=510) DEBUG 04-22 16:04:35 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=510) DEBUG 04-22 16:04:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=510) DEBUG 04-22 16:04:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=510) DEBUG 04-22 16:04:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=510) DEBUG 04-22 16:04:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=510) 
DEBUG 04-22 16:04:36 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(EngineCore pid=510) DEBUG 04-22 16:04:36 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=510) DEBUG 04-22 16:04:36 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. -(EngineCore pid=510) Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. -(APIServer pid=1) DEBUG 04-22 16:04:43 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=510) DEBUG 04-22 16:04:44 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=510) INFO 04-22 16:04:44 [v1/worker/gpu_model_runner.py:4735] Starting to load model google/gemma-3-12b-it... -(EngineCore pid=510) INFO 04-22 16:04:45 [model_executor/models/interfaces.py:171] Contains out of vocabulary multimodal tokens? False -(EngineCore pid=510) INFO 04-22 16:04:45 [platforms/cuda.py:390] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention -(EngineCore pid=510) INFO 04-22 16:04:45 [model_executor/.../attention/mm_encoder_attention.py:230] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention. -(EngineCore pid=510) INFO 04-22 16:04:45 [config/vllm.py:790] Asynchronous scheduling is enabled. 
-(EngineCore pid=510) DEBUG 04-22 16:04:45 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=510) DEBUG 04-22 16:04:45 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=256, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=True, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASH_ATTN: [partial multimodal token full attention not supported], FLASHINFER: [partial multimodal token full attention not supported]}. -(EngineCore pid=510) INFO 04-22 16:04:45 [platforms/cuda.py:334] Using TRITON_ATTN attention backend out of potential backends: ['TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=510) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=510) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=510) DEBUG 04-22 16:04:45 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=510) DEBUG 04-22 16:04:45 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) -(EngineCore pid=510) DEBUG 04-22 16:04:45 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 290, 'gelu_and_mul': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=510) DEBUG 04-22 16:04:45 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) -(EngineCore pid=510) DEBUG 04-22 16:04:45 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 290, 'gelu_and_mul': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=510) DEBUG 04-22 16:04:45 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=510) DEBUG 04-22 16:04:45 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00003-of-00005.safetensors', 'model-00005-of-00005.safetensors', 'model-00001-of-00005.safetensors', 'model-00002-of-00005.safetensors', 'model-00004-of-00005.safetensors']] -(APIServer pid=1) DEBUG 04-22 16:04:53 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 16:05:03 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 16:05:13 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 16:05:23 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=510) INFO 04-22 16:05:24 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for google/gemma-3-12b-it: 38.575716 seconds -(EngineCore pid=510) Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00 -(APIServer pid=1) DEBUG 04-22 16:05:53 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gemma3.py -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=510) INFO 04-22 16:05:55 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/f76d04716e/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=af89a8e94b comp=e546579c48 code=616836617014c5f9fc7251fa87e8f62cc5716448a3f7628492691f84ac2574d9 dir=/data/.cache/vllm/torch_compile_cache/f76d04716e/rank_0_0/backbone -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 
'NVCC_THREADS': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=510) DEBUG 04-22 
16:05:55 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=510) DEBUG 04-22 16:05:55 
[compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=510) DEBUG 04-22 
16:05:55 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 
'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 
'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', 
-(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=510) DEBUG 04-22 16:05:55 
[compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, 
-(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=510) DEBUG 04-22 16:05:55 [compilation/backends.py:1074] Vllm config hash: af89a8e94b -(EngineCore pid=510) INFO 04-22 16:05:55 [compilation/backends.py:1111] Dynamo bytecode transform time: 7.88 s -(EngineCore pid=510) DEBUG 04-22 16:05:56 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=510) DEBUG 04-22 16:05:56 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=510) DEBUG 04-22 16:05:57 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=510) DEBUG 04-22 16:05:57 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=510) DEBUG 04-22 16:05:57 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms -(EngineCore pid=510) DEBUG 04-22 16:05:57 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes 
-(EngineCore pid=510) DEBUG 04-22 16:05:57 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=510) INFO 04-22 16:06:01 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=510) DEBUG 04-22 16:06:01 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/f76d04716e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=510) DEBUG 04-22 16:06:01 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=510) DEBUG 04-22 16:06:01 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=510) DEBUG 04-22 16:06:01 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.7 ms -(EngineCore pid=510) DEBUG 04-22 16:06:01 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=510) DEBUG 04-22 16:06:01 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(APIServer pid=1) DEBUG 04-22 16:06:03 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=510) DEBUG 04-22 16:06:04 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/f76d04716e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=510) DEBUG 04-22 16:06:04 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/f76d04716e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_2') -(EngineCore pid=510) DEBUG 04-22 16:06:05 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=510) DEBUG 04-22 16:06:05 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=510) DEBUG 04-22 16:06:05 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.7 ms -(EngineCore pid=510) DEBUG 04-22 16:06:05 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=510) DEBUG 04-22 16:06:05 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=510) DEBUG 04-22 16:06:07 [compilation/backends.py:377] Store the 5-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_5', '/data/.cache/vllm/torch_compile_cache/f76d04716e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_5') -(EngineCore pid=510) DEBUG 04-22 16:06:08 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=510) DEBUG 04-22 16:06:08 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.3 ms -(EngineCore pid=510) DEBUG 04-22 16:06:08 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.5 ms -(EngineCore 
pid=510) DEBUG 04-22 16:06:08 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=510) DEBUG 04-22 16:06:08 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(EngineCore pid=510) DEBUG 04-22 16:06:08 [compilation/backends.py:377] Store the 48-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_48', '/data/.cache/vllm/torch_compile_cache/f76d04716e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_48') -(EngineCore pid=510) INFO 04-22 16:06:08 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 12.51 s -(EngineCore pid=510) DEBUG 04-22 16:06:08 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/f76d04716e/rank_0_0/backbone/computation_graph.py -(EngineCore pid=510) INFO 04-22 16:06:11 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/172a779f211974e33e1f8d0b4be257e092b93b266784f6a1ef47398d3ac31e46/rank_0_0/model -(EngineCore pid=510) INFO 04-22 16:06:11 [compilation/monitor.py:48] torch.compile took 23.21 s in total -(EngineCore pid=510) INFO 04-22 16:06:11 [compilation/monitor.py:76] Initial profiling/warmup run took 0.41 s -(APIServer pid=1) DEBUG 04-22 16:06:13 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=510) INFO 04-22 16:06:17 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=510) DEBUG 04-22 16:06:17 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=510) INFO 04-22 16:06:17 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=510) DEBUG 04-22 16:06:18 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=510) DEBUG 04-22 16:06:18 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=510) DEBUG 04-22 16:06:18 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=510) DEBUG 04-22 16:06:18 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=510) DEBUG 04-22 16:06:18 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=510) DEBUG 04-22 16:06:18 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=510) DEBUG 04-22 16:06:18 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 146.00 MiB first-capture + (51-1) × 10.00 MiB per-graph -(EngineCore pid=510) DEBUG 04-22 16:06:18 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=510) DEBUG 04-22 16:06:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=510) DEBUG 04-22 16:06:19 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
-(EngineCore pid=510) DEBUG 04-22 16:06:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=510) DEBUG 04-22 16:06:19 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=510) DEBUG 04-22 16:06:19 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=510) DEBUG 04-22 16:06:19 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 4.00 MiB first-capture + (51-1) × 8.00 MiB per-graph -(EngineCore pid=510) DEBUG 04-22 16:06:20 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=510) INFO 04-22 16:06:20 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.02 GiB total -(EngineCore pid=510) DEBUG 04-22 16:06:20 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=510) DEBUG 04-22 16:06:20 [v1/worker/gpu_worker.py:430] Free memory after profiling: 54.52 GiB (total), 51.07 GiB (within requested) -(EngineCore pid=510) DEBUG 04-22 16:06:20 [v1/worker/gpu_worker.py:435] Memory profiling takes 33.32 seconds. Total non KV cache memory: 27.51GiB; torch peak memory increase: 3.94GiB; non-torch forward increase memory: 0.25GiB; weights memory: 23.31GiB. -(EngineCore pid=510) INFO 04-22 16:06:20 [v1/worker/gpu_worker.py:436] Available KV cache memory: 47.72 GiB -(EngineCore pid=510) INFO 04-22 16:06:20 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9629 to maintain the same effective KV cache size. 
-(EngineCore pid=510) INFO 04-22 16:06:20 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 130,304 tokens -(EngineCore pid=510) INFO 04-22 16:06:20 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 15.88x -(EngineCore pid=510) 2026-04-22 16:06:20,756 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=510) DEBUG 04-22 16:06:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=510) 2026-04-22 16:06:20,771 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=510) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 15:50:45 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 15:50:45 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 15:50:45 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:50:45 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 15:50:45 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 15:50:45 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-3-27b-it -(APIServer pid=1) INFO 04-22 15:50:45 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 15:50:45 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:50:45 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-3-27b-it', 'model': 'google/gemma-3-27b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 15:50:45 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 403, in hf_raise_for_status -(APIServer pid=1) response.raise_for_status() -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1028, in raise_for_status -(APIServer pid=1) raise HTTPError(http_error_msg, response=self) -(APIServer pid=1) requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/google/gemma-3-27b-it/resolve/main/config.json -(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1550, in _get_metadata_or_catch_error -(APIServer pid=1) metadata = get_hf_file_metadata( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn -(APIServer 
pid=1) return fn(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1467, in get_hf_file_metadata -(APIServer pid=1) r = _request_wrapper( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 283, in _request_wrapper -(APIServer pid=1) response = _request_wrapper( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 307, in _request_wrapper -(APIServer pid=1) hf_raise_for_status(response) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 467, in hf_raise_for_status -(APIServer pid=1) raise _format(HfHubHTTPError, message, response) from e -(APIServer pid=1) huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-69e8ee56-748b87a6382fbdc8082d9120;9cf089a8-e061-415a-b1d0-5ee33e12c23e) -(APIServer pid=1) -(APIServer pid=1) 403 Forbidden: Please enable access to public gated repositories in your fine-grained token settings to view this repository.. -(APIServer pid=1) Cannot access content at: https://huggingface.co/google/gemma-3-27b-it/resolve/main/config.json. -(APIServer pid=1) Make sure your token has the correct permissions. 
-(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 479, in cached_files -(APIServer pid=1) hf_hub_download( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn -(APIServer pid=1) return fn(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1014, in hf_hub_download -(APIServer pid=1) return _hf_hub_download_to_cache_dir( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1121, in _hf_hub_download_to_cache_dir -(APIServer pid=1) _raise_on_head_call_error(head_call_error, force_download, local_files_only) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1665, in _raise_on_head_call_error -(APIServer pid=1) raise LocalEntryNotFoundError( -(APIServer pid=1) huggingface_hub.errors.LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on. 
-(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in -(APIServer pid=1) sys.exit(main()) -(APIServer pid=1) ^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main -(APIServer pid=1) args.dispatch_function(args) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd -(APIServer pid=1) uvloop.run(run_server(args)) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run -(APIServer pid=1) return __asyncio.run( -(APIServer pid=1) ^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run -(APIServer pid=1) return runner.run(main) -(APIServer pid=1) ^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run -(APIServer pid=1) return self._loop.run_until_complete(task) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper -(APIServer pid=1) return await main -(APIServer pid=1) ^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server -(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker -(APIServer pid=1) async with build_async_engine_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return 
await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client -(APIServer pid=1) async with build_async_engine_client_from_engine_args( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 124, in build_async_engine_client_from_engine_args -(APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1539, in create_engine_config -(APIServer pid=1) maybe_override_with_speculators( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/config.py", line 583, in maybe_override_with_speculators -(APIServer pid=1) config_dict, _ = PretrainedConfig.get_config_dict( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 662, in get_config_dict -(APIServer pid=1) config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 721, in _get_config_dict -(APIServer pid=1) resolved_config_file = cached_file( -(APIServer pid=1) ^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 322, in cached_file -(APIServer pid=1) file = 
cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 553, in cached_files -(APIServer pid=1) raise OSError( -(APIServer pid=1) OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files. -(APIServer pid=1) Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'. diff --git a/accuracy/results/v0.19.0/logs/google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.log deleted file mode 100644 index df501599..00000000 --- a/accuracy/results/v0.19.0/logs/google-gemma-3-27b-it--h100-80gb--tp1pp1dp1--8192.log +++ /dev/null @@ -1,783 +0,0 @@ -DEBUG 04-22 16:06:52 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 16:06:52 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 16:06:52 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 16:06:52 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 16:06:52 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 16:06:52 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 16:06:52 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 16:06:52 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 16:06:52 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 16:06:52 [platforms/__init__.py:62] Checking if CUDA platform is available. 
-DEBUG 04-22 16:06:52 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 16:06:52 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 16:06:57 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 16:06:59 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 16:06:59 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 16:06:59 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 16:06:59 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 16:06:59 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 16:06:59 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 16:06:59 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 16:06:59 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 16:06:59 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-3-27b-it -(APIServer pid=1) INFO 04-22 16:06:59 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 16:06:59 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 16:06:59 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-3-27b-it', 'model': 'google/gemma-3-27b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 16:06:59 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 16:06:59 [model_executor/models/registry.py:827] Loaded model info for class 
vllm.model_executor.models.gemma3_mm.Gemma3ForConditionalGeneration from cache -(APIServer pid=1) DEBUG 04-22 16:06:59 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003820 secs -(APIServer pid=1) INFO 04-22 16:06:59 [config/model.py:549] Resolved architecture: Gemma3ForConditionalGeneration -(APIServer pid=1) INFO 04-22 16:06:59 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 16:06:59 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 16:06:59 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 16:06:59 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 16:06:59 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 16:06:59 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 16:06:59 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 16:06:59 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) WARNING 04-22 16:06:59 [platforms/cuda.py:199] Forcing --disable_chunked_mm_input for models with multimodal-bidirectional attention. -(APIServer pid=1) DEBUG 04-22 16:06:59 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 16:06:59 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 16:07:02 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 16:07:03 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. 
-(APIServer pid=1) DEBUG 04-22 16:07:03 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -(APIServer pid=1) Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. -DEBUG 04-22 16:07:15 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 16:07:15 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 16:07:15 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 16:07:15 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 16:07:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 16:07:15 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 16:07:15 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 16:07:15 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 16:07:15 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 16:07:15 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 16:07:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 16:07:15 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 16:07:20 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=509) DEBUG 04-22 16:07:22 [v1/engine/core.py:1018] Waiting for init message from front-end. 
-(EngineCore pid=509) DEBUG 04-22 16:07:22 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/1376714c-14a9-478f-94c9-ec390c1e5ba5'], outputs=['ipc:///tmp/528325d5-7c95-4fb2-89cd-d7206a1f3d4b'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(APIServer pid=1) DEBUG 04-22 16:07:22 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=509) DEBUG 04-22 16:07:22 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=509) DEBUG 04-22 16:07:22 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=509) DEBUG 04-22 16:07:22 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=509) DEBUG 04-22 16:07:22 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=509) DEBUG 04-22 16:07:22 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=509) INFO 04-22 16:07:22 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google/gemma-3-27b-it', speculative_config=None, tokenizer='google/gemma-3-27b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google/gemma-3-27b-it, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=509) DEBUG 04-22 16:07:22 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(EngineCore pid=509) DEBUG 04-22 16:07:24 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.228:44139 backend=nccl -(EngineCore pid=509) INFO 04-22 16:07:24 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.228:44139 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(EngineCore pid=509) DEBUG 04-22 16:07:24 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=509) INFO 04-22 16:07:24 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=509) DEBUG 04-22 16:07:24 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776874044.4904332, auto_measure=True -(EngineCore pid=509) DEBUG 04-22 16:07:24 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=509) DEBUG 04-22 16:07:24 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=509) DEBUG 04-22 16:07:24 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=509) DEBUG 04-22 16:07:24 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=509) DEBUG 04-22 16:07:24 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=509) 
DEBUG 04-22 16:07:24 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(EngineCore pid=509) DEBUG 04-22 16:07:24 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=509) DEBUG 04-22 16:07:24 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. -(EngineCore pid=509) Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. -(APIServer pid=1) DEBUG 04-22 16:07:32 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=509) DEBUG 04-22 16:07:32 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=509) INFO 04-22 16:07:32 [v1/worker/gpu_model_runner.py:4735] Starting to load model google/gemma-3-27b-it... -(EngineCore pid=509) INFO 04-22 16:07:32 [model_executor/models/interfaces.py:171] Contains out of vocabulary multimodal tokens? False -(EngineCore pid=509) INFO 04-22 16:07:32 [platforms/cuda.py:390] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention -(EngineCore pid=509) INFO 04-22 16:07:32 [model_executor/.../attention/mm_encoder_attention.py:230] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention. -(EngineCore pid=509) INFO 04-22 16:07:32 [config/vllm.py:790] Asynchronous scheduling is enabled. 
-(EngineCore pid=509) DEBUG 04-22 16:07:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=509) DEBUG 04-22 16:07:33 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=True, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASH_ATTN: [partial multimodal token full attention not supported], FLASHINFER: [partial multimodal token full attention not supported]}. -(EngineCore pid=509) INFO 04-22 16:07:33 [platforms/cuda.py:334] Using TRITON_ATTN attention backend out of potential backends: ['TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=509) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=509) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=509) DEBUG 04-22 16:07:33 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=509) DEBUG 04-22 16:07:33 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) -(EngineCore pid=509) DEBUG 04-22 16:07:33 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 374, 'gelu_and_mul': 62, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=509) DEBUG 04-22 16:07:33 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) -(EngineCore pid=509) DEBUG 04-22 16:07:33 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 374, 'gelu_and_mul': 62, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=509) DEBUG 04-22 16:07:33 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=509) DEBUG 04-22 16:07:33 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00012-of-00012.safetensors', 'model-00010-of-00012.safetensors', 'model-00003-of-00012.safetensors', 'model-00006-of-00012.safetensors', 'model-00007-of-00012.safetensors', 'model-00005-of-00012.safetensors', 'model-00001-of-00012.safetensors', 'model-00009-of-00012.safetensors', 'model-00011-of-00012.safetensors', 'model-00004-of-00012.safetensors', 'model-00008-of-00012.safetensors', 'model-00002-of-00012.safetensors']] -(APIServer pid=1) DEBUG 04-22 16:07:42 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 16:07:52 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 16:08:02 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(APIServer pid=1) DEBUG 04-22 16:08:12 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 16:08:22 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 16:08:32 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 16:08:42 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=509) INFO 04-22 16:08:48 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for google/gemma-3-27b-it: 75.376589 seconds -(EngineCore pid=509) Loading safetensors checkpoint shards: 0% Completed | 0/12 [00:00 -(APIServer pid=1) DEBUG 04-22 16:09:42 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gemma3.py -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=509) INFO 04-22 16:09:46 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/2f8ea44192/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=cb6dae3406 comp=e546579c48 code=616836617014c5f9fc7251fa87e8f62cc5716448a3f7628492691f84ac2574d9 dir=/data/.cache/vllm/torch_compile_cache/2f8ea44192/rank_0_0/backbone 
-(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=509) DEBUG 
04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=509) DEBUG 04-22 
16:09:46 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 
[compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=509) DEBUG 04-22 16:09:46 
[compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=509) DEBUG 04-22 16:09:46 
[compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 
[compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=509) 
DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, 
-(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': 
False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 
[compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/backends.py:1074] Vllm config hash: cb6dae3406 -(EngineCore pid=509) INFO 04-22 16:09:46 [compilation/backends.py:1111] Dynamo bytecode transform time: 9.32 s -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=509) DEBUG 04-22 16:09:46 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=509) 
DEBUG 04-22 16:09:47 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=509) DEBUG 04-22 16:09:47 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 3.3 ms -(EngineCore pid=509) DEBUG 04-22 16:09:47 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 5.0 ms -(EngineCore pid=509) DEBUG 04-22 16:09:47 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=509) DEBUG 04-22 16:09:47 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=509) INFO 04-22 16:09:51 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=509) DEBUG 04-22 16:09:51 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/2f8ea44192/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=509) DEBUG 04-22 16:09:51 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=509) DEBUG 04-22 16:09:51 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(EngineCore pid=509) DEBUG 04-22 16:09:51 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.7 ms -(EngineCore pid=509) DEBUG 04-22 16:09:51 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=509) DEBUG 04-22 16:09:51 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(APIServer pid=1) DEBUG 04-22 16:09:52 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=509) DEBUG 04-22 16:09:54 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/2f8ea44192/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=509) DEBUG 04-22 16:09:54 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/2f8ea44192/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_2') -(EngineCore pid=509) DEBUG 04-22 16:09:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=509) DEBUG 04-22 16:09:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(EngineCore pid=509) DEBUG 04-22 16:09:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 2.2 ms -(EngineCore pid=509) DEBUG 04-22 16:09:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=509) DEBUG 04-22 16:09:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=509) DEBUG 04-22 16:09:57 [compilation/backends.py:377] Store the 5-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_5', '/data/.cache/vllm/torch_compile_cache/2f8ea44192/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_5') -(EngineCore pid=509) DEBUG 04-22 16:09:58 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=509) DEBUG 04-22 16:09:58 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.2 ms -(EngineCore pid=509) DEBUG 04-22 16:09:58 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.4 ms -(EngineCore 
pid=509) DEBUG 04-22 16:09:58 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=509) DEBUG 04-22 16:09:58 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(EngineCore pid=509) DEBUG 04-22 16:09:59 [compilation/backends.py:377] Store the 62-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_62', '/data/.cache/vllm/torch_compile_cache/2f8ea44192/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_62') -(EngineCore pid=509) INFO 04-22 16:09:59 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 12.34 s -(EngineCore pid=509) DEBUG 04-22 16:09:59 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/2f8ea44192/rank_0_0/backbone/computation_graph.py -(EngineCore pid=509) INFO 04-22 16:10:01 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/b84fd6dd09ba72e7f973ec2576a31cb1fbd82907c108bab96cab78bc2c00c365/rank_0_0/model -(EngineCore pid=509) INFO 04-22 16:10:01 [compilation/monitor.py:48] torch.compile took 25.25 s in total -(APIServer pid=1) DEBUG 04-22 16:10:02 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=509) INFO 04-22 16:10:02 [compilation/monitor.py:76] Initial profiling/warmup run took 0.48 s -(EngineCore pid=509) WARNING 04-22 16:10:08 [v1/core/kv_cache_utils.py:1059] Add 8 padding layers, may waste at most 15.38% KV cache memory -(EngineCore pid=509) INFO 04-22 16:10:08 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=509) DEBUG 04-22 16:10:08 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=509) INFO 04-22 16:10:08 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=509) DEBUG 04-22 16:10:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=509) DEBUG 04-22 16:10:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=509) DEBUG 04-22 16:10:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=509) DEBUG 04-22 16:10:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=509) DEBUG 04-22 16:10:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=509) DEBUG 04-22 16:10:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=509) DEBUG 04-22 16:10:09 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 172.00 MiB first-capture + (51-1) × 12.00 MiB per-graph -(EngineCore pid=509) DEBUG 04-22 16:10:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=509) DEBUG 04-22 16:10:10 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=509) DEBUG 04-22 16:10:10 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=509) DEBUG 04-22 16:10:10 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=509) DEBUG 04-22 16:10:10 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=509) DEBUG 04-22 16:10:10 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=509) DEBUG 04-22 16:10:10 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 8.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=509) DEBUG 04-22 16:10:11 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=509) INFO 04-22 16:10:11 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.05 GiB total -(EngineCore pid=509) DEBUG 04-22 16:10:11 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=509) DEBUG 04-22 16:10:11 [v1/worker/gpu_worker.py:430] Free memory after profiling: 26.33 GiB (total), 22.88 GiB (within requested) -(EngineCore pid=509) DEBUG 04-22 16:10:11 [v1/worker/gpu_worker.py:435] Memory profiling takes 35.21 seconds. Total non KV cache memory: 55.7GiB; torch peak memory increase: 3.99GiB; non-torch forward increase memory: 0.26GiB; weights memory: 51.45GiB. 
-(EngineCore pid=509) INFO 04-22 16:10:11 [v1/worker/gpu_worker.py:436] Available KV cache memory: 19.53 GiB -(EngineCore pid=509) INFO 04-22 16:10:11 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9632 to maintain the same effective KV cache size. -(EngineCore pid=509) WARNING 04-22 16:10:11 [v1/core/kv_cache_utils.py:1059] Add 8 padding layers, may waste at most 15.38% KV cache memory -(EngineCore pid=509) INFO 04-22 16:10:11 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 36,560 tokens -(EngineCore pid=509) INFO 04-22 16:10:11 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 4.46x -(EngineCore pid=509) 2026-04-22 16:10:11,385 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=509) DEBUG 04-22 16:10:11 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=509) 2026-04-22 16:10:11,403 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=509) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 15:50:19 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 15:50:19 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 15:50:19 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:50:19 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 15:50:19 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 15:50:19 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-3-4b-it -(APIServer pid=1) INFO 04-22 15:50:19 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 15:50:19 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 15:50:19 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-3-4b-it', 'model': 'google/gemma-3-4b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 15:50:19 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 403, in hf_raise_for_status -(APIServer pid=1) response.raise_for_status() -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/requests/models.py", line 1028, in raise_for_status -(APIServer pid=1) raise HTTPError(http_error_msg, response=self) -(APIServer pid=1) requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/google/gemma-3-4b-it/resolve/main/config.json -(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1550, in _get_metadata_or_catch_error -(APIServer pid=1) metadata = get_hf_file_metadata( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn -(APIServer pid=1) 
return fn(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1467, in get_hf_file_metadata -(APIServer pid=1) r = _request_wrapper( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 283, in _request_wrapper -(APIServer pid=1) response = _request_wrapper( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 307, in _request_wrapper -(APIServer pid=1) hf_raise_for_status(response) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_http.py", line 467, in hf_raise_for_status -(APIServer pid=1) raise _format(HfHubHTTPError, message, response) from e -(APIServer pid=1) huggingface_hub.errors.HfHubHTTPError: (Request ID: Root=1-69e8ee3b-39f0338c303d9ca327360ac7;6d67260f-9456-4082-9f26-baaf306ee9c3) -(APIServer pid=1) -(APIServer pid=1) 403 Forbidden: Please enable access to public gated repositories in your fine-grained token settings to view this repository.. -(APIServer pid=1) Cannot access content at: https://huggingface.co/google/gemma-3-4b-it/resolve/main/config.json. -(APIServer pid=1) Make sure your token has the correct permissions. 
-(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 479, in cached_files -(APIServer pid=1) hf_hub_download( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn -(APIServer pid=1) return fn(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1014, in hf_hub_download -(APIServer pid=1) return _hf_hub_download_to_cache_dir( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1121, in _hf_hub_download_to_cache_dir -(APIServer pid=1) _raise_on_head_call_error(head_call_error, force_download, local_files_only) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/huggingface_hub/file_download.py", line 1665, in _raise_on_head_call_error -(APIServer pid=1) raise LocalEntryNotFoundError( -(APIServer pid=1) huggingface_hub.errors.LocalEntryNotFoundError: An error happened while trying to locate the file on the Hub and we cannot find the requested files in the local cache. Please check your connection and try again or make sure your Internet connection is on. 
-(APIServer pid=1) -(APIServer pid=1) The above exception was the direct cause of the following exception: -(APIServer pid=1) -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in -(APIServer pid=1) sys.exit(main()) -(APIServer pid=1) ^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main -(APIServer pid=1) args.dispatch_function(args) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd -(APIServer pid=1) uvloop.run(run_server(args)) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run -(APIServer pid=1) return __asyncio.run( -(APIServer pid=1) ^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run -(APIServer pid=1) return runner.run(main) -(APIServer pid=1) ^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run -(APIServer pid=1) return self._loop.run_until_complete(task) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper -(APIServer pid=1) return await main -(APIServer pid=1) ^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server -(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker -(APIServer pid=1) async with build_async_engine_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return 
await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client -(APIServer pid=1) async with build_async_engine_client_from_engine_args( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 124, in build_async_engine_client_from_engine_args -(APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1539, in create_engine_config -(APIServer pid=1) maybe_override_with_speculators( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/transformers_utils/config.py", line 583, in maybe_override_with_speculators -(APIServer pid=1) config_dict, _ = PretrainedConfig.get_config_dict( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 662, in get_config_dict -(APIServer pid=1) config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py", line 721, in _get_config_dict -(APIServer pid=1) resolved_config_file = cached_file( -(APIServer pid=1) ^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 322, in cached_file -(APIServer pid=1) file = 
cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/transformers/utils/hub.py", line 553, in cached_files -(APIServer pid=1) raise OSError( -(APIServer pid=1) OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files. -(APIServer pid=1) Check your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'. diff --git a/accuracy/results/v0.19.0/logs/google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.log deleted file mode 100644 index babe20f2..00000000 --- a/accuracy/results/v0.19.0/logs/google-gemma-3-4b-it--h100-80gb--tp1pp1dp1--8192.log +++ /dev/null @@ -1,766 +0,0 @@ -DEBUG 04-22 16:01:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 16:01:51 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 16:01:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 16:01:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 16:01:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 16:01:51 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 16:01:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 16:01:51 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 16:01:51 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 16:01:51 [platforms/__init__.py:62] Checking if CUDA platform is available. 
-DEBUG 04-22 16:01:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 16:01:51 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 16:01:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 16:01:58 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 16:01:58 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 16:01:58 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 16:01:58 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 16:01:58 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 16:01:58 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 16:01:58 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 16:01:58 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 16:01:58 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-3-4b-it -(APIServer pid=1) INFO 04-22 16:01:58 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 16:01:58 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 16:01:58 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-3-4b-it', 'model': 'google/gemma-3-4b-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 16:01:58 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 16:01:59 [model_executor/models/registry.py:774] Cached model info file for class 
vllm.model_executor.models.gemma3_mm.Gemma3ForConditionalGeneration not found -(APIServer pid=1) DEBUG 04-22 16:01:59 [model_executor/models/registry.py:834] Cache model info for class vllm.model_executor.models.gemma3_mm.Gemma3ForConditionalGeneration miss. Loading model instead. -(APIServer pid=1) DEBUG 04-22 16:02:09 [model_executor/models/registry.py:844] Loaded model info for class vllm.model_executor.models.gemma3_mm.Gemma3ForConditionalGeneration -(APIServer pid=1) DEBUG 04-22 16:02:09 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 9.9875990 secs -(APIServer pid=1) INFO 04-22 16:02:09 [config/model.py:549] Resolved architecture: Gemma3ForConditionalGeneration -(APIServer pid=1) INFO 04-22 16:02:09 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 16:02:09 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 16:02:09 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 16:02:09 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 16:02:09 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 16:02:09 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 16:02:09 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 16:02:09 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) WARNING 04-22 16:02:09 [platforms/cuda.py:199] Forcing --disable_chunked_mm_input for models with multimodal-bidirectional attention. -(APIServer pid=1) DEBUG 04-22 16:02:09 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. 
-(APIServer pid=1) DEBUG 04-22 16:02:09 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 16:02:12 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 16:02:13 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. -(APIServer pid=1) DEBUG 04-22 16:02:13 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -(APIServer pid=1) Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. -DEBUG 04-22 16:02:25 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 16:02:25 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 16:02:25 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 16:02:25 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 16:02:25 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 16:02:25 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 16:02:25 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 16:02:25 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 16:02:25 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 16:02:25 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 16:02:25 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 16:02:25 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-22 16:02:30 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(APIServer pid=1) DEBUG 04-22 16:02:32 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. -(EngineCore pid=704) DEBUG 04-22 16:02:32 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 16:02:32 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=704) DEBUG 04-22 16:02:32 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/bb3f96a5-e878-4b7a-97bc-6b285c64279c'], outputs=['ipc:///tmp/f5d2b059-3379-4652-91b4-d539a064ea95'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=704) DEBUG 04-22 16:02:32 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=704) DEBUG 04-22 16:02:32 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=704) DEBUG 04-22 16:02:32 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=704) DEBUG 04-22 16:02:32 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=704) DEBUG 04-22 16:02:32 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=704) INFO 04-22 16:02:32 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google/gemma-3-4b-it', speculative_config=None, tokenizer='google/gemma-3-4b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google/gemma-3-4b-it, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=704) DEBUG 04-22 16:02:32 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(EngineCore pid=704) DEBUG 04-22 16:02:34 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.218:44417 backend=nccl -(EngineCore pid=704) INFO 04-22 16:02:34 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.10.218:44417 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(EngineCore pid=704) DEBUG 04-22 16:02:34 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=704) INFO 04-22 16:02:34 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=704) DEBUG 04-22 16:02:34 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776873754.7606223, auto_measure=True -(EngineCore pid=704) DEBUG 04-22 16:02:34 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=704) DEBUG 04-22 16:02:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=704) DEBUG 04-22 16:02:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=704) DEBUG 04-22 16:02:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=704) DEBUG 04-22 16:02:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=704) 
DEBUG 04-22 16:02:34 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(EngineCore pid=704) DEBUG 04-22 16:02:34 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=704) DEBUG 04-22 16:02:34 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. -(EngineCore pid=704) Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`. -(APIServer pid=1) DEBUG 04-22 16:02:42 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=704) DEBUG 04-22 16:02:42 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=704) INFO 04-22 16:02:42 [v1/worker/gpu_model_runner.py:4735] Starting to load model google/gemma-3-4b-it... -(EngineCore pid=704) INFO 04-22 16:02:43 [model_executor/models/interfaces.py:171] Contains out of vocabulary multimodal tokens? False -(EngineCore pid=704) INFO 04-22 16:02:43 [platforms/cuda.py:390] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention -(EngineCore pid=704) INFO 04-22 16:02:43 [model_executor/.../attention/mm_encoder_attention.py:230] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention. -(EngineCore pid=704) INFO 04-22 16:02:43 [config/vllm.py:790] Asynchronous scheduling is enabled. 
-(EngineCore pid=704) DEBUG 04-22 16:02:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=704) DEBUG 04-22 16:02:43 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=256, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=True, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASH_ATTN: [partial multimodal token full attention not supported], FLASHINFER: [partial multimodal token full attention not supported]}. -(EngineCore pid=704) INFO 04-22 16:02:43 [platforms/cuda.py:334] Using TRITON_ATTN attention backend out of potential backends: ['TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=704) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=704) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=704) DEBUG 04-22 16:02:43 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=704) DEBUG 04-22 16:02:43 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) -(EngineCore pid=704) DEBUG 04-22 16:02:43 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 206, 'gelu_and_mul': 34, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=704) DEBUG 04-22 16:02:43 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) -(EngineCore pid=704) DEBUG 04-22 16:02:43 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 206, 'gelu_and_mul': 34, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=704) DEBUG 04-22 16:02:43 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=704) DEBUG 04-22 16:02:43 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00002.safetensors', 'model-00001-of-00002.safetensors']] -(APIServer pid=1) DEBUG 04-22 16:02:52 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=704) INFO 04-22 16:02:56 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for google/gemma-3-4b-it: 12.245518 seconds -(EngineCore pid=704) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gemma3.py -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=704) INFO 04-22 16:03:10 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/51fc3e1252/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=fa200d5147 comp=e546579c48 code=616836617014c5f9fc7251fa87e8f62cc5716448a3f7628492691f84ac2574d9 dir=/data/.cache/vllm/torch_compile_cache/51fc3e1252/rank_0_0/backbone -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=704) DEBUG 04-22 
16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 
'VLLM_API_KEY': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 
'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 
'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=704) DEBUG 
04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=704) 
DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=704) DEBUG 04-22 16:03:10 
[compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 
[compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, 
-(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/backends.py:1074] Vllm config hash: fa200d5147 -(EngineCore pid=704) INFO 04-22 16:03:10 [compilation/backends.py:1111] Dynamo bytecode transform time: 5.58 s -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=704) DEBUG 04-22 16:03:10 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=704) DEBUG 04-22 16:03:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=704) DEBUG 04-22 16:03:11 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=704) DEBUG 04-22 16:03:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.2 ms -(EngineCore pid=704) DEBUG 04-22 16:03:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=704) DEBUG 04-22 16:03:11 [compilation/passes/vllm_inductor_pass.py:79] 
FixFunctionalizationPass completed in 0.2 ms -(APIServer pid=1) DEBUG 04-22 16:03:12 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=704) INFO 04-22 16:03:15 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=704) DEBUG 04-22 16:03:15 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/51fc3e1252/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=704) DEBUG 04-22 16:03:16 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=704) DEBUG 04-22 16:03:16 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=704) DEBUG 04-22 16:03:16 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.8 ms -(EngineCore pid=704) DEBUG 04-22 16:03:16 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=704) DEBUG 04-22 16:03:16 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=704) DEBUG 04-22 16:03:18 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/51fc3e1252/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=704) DEBUG 04-22 16:03:19 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/51fc3e1252/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_2') -(EngineCore pid=704) DEBUG 04-22 16:03:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 
no-op reshapes and slices -(EngineCore pid=704) DEBUG 04-22 16:03:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=704) DEBUG 04-22 16:03:19 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.7 ms -(EngineCore pid=704) DEBUG 04-22 16:03:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=704) DEBUG 04-22 16:03:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=704) DEBUG 04-22 16:03:22 [compilation/backends.py:377] Store the 5-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_5', '/data/.cache/vllm/torch_compile_cache/51fc3e1252/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_5') -(APIServer pid=1) DEBUG 04-22 16:03:22 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=704) DEBUG 04-22 16:03:22 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=704) DEBUG 04-22 16:03:22 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.2 ms -(EngineCore pid=704) DEBUG 04-22 16:03:22 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.4 ms -(EngineCore pid=704) DEBUG 04-22 16:03:22 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=704) DEBUG 04-22 16:03:22 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(EngineCore pid=704) DEBUG 04-22 16:03:23 [compilation/backends.py:377] Store the 34-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_34', '/data/.cache/vllm/torch_compile_cache/51fc3e1252/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_34') -(EngineCore pid=704) INFO 
04-22 16:03:23 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 12.51 s -(EngineCore pid=704) DEBUG 04-22 16:03:23 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/51fc3e1252/rank_0_0/backbone/computation_graph.py -(EngineCore pid=704) INFO 04-22 16:03:25 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/bf6f8b734858fcb15213f9ee93845ad6e23d10da81c12f4ce6ecf561a8822c30/rank_0_0/model -(EngineCore pid=704) INFO 04-22 16:03:25 [compilation/monitor.py:48] torch.compile took 20.04 s in total -(EngineCore pid=704) INFO 04-22 16:03:25 [compilation/monitor.py:76] Initial profiling/warmup run took 0.54 s -(EngineCore pid=704) WARNING 04-22 16:03:31 [v1/core/kv_cache_utils.py:1059] Add 1 padding layers, may waste at most 3.45% KV cache memory -(EngineCore pid=704) INFO 04-22 16:03:31 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=704) DEBUG 04-22 16:03:31 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=704) INFO 04-22 16:03:31 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=704) DEBUG 04-22 16:03:31 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=704) DEBUG 04-22 16:03:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=704) DEBUG 04-22 16:03:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=704) DEBUG 04-22 16:03:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=704) DEBUG 04-22 16:03:32 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=704) DEBUG 04-22 16:03:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=704) DEBUG 04-22 16:03:32 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=704) DEBUG 04-22 16:03:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(APIServer pid=1) DEBUG 04-22 16:03:32 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=704) DEBUG 04-22 16:03:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=704) DEBUG 04-22 16:03:33 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=704) DEBUG 04-22 16:03:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=704) DEBUG 04-22 16:03:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=704) DEBUG 04-22 16:03:33 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=704) DEBUG 04-22 16:03:33 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 4.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=704) DEBUG 04-22 16:03:33 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=704) INFO 04-22 16:03:33 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.70 GiB total -(EngineCore pid=704) DEBUG 04-22 16:03:34 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; 
Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=704) DEBUG 04-22 16:03:34 [v1/worker/gpu_worker.py:430] Free memory after profiling: 69.27 GiB (total), 65.82 GiB (within requested) -(EngineCore pid=704) DEBUG 04-22 16:03:34 [v1/worker/gpu_worker.py:435] Memory profiling takes 29.72 seconds. Total non KV cache memory: 12.72GiB; torch peak memory increase: 3.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 8.58GiB. -(EngineCore pid=704) INFO 04-22 16:03:34 [v1/worker/gpu_worker.py:436] Available KV cache memory: 62.51 GiB -(EngineCore pid=704) INFO 04-22 16:03:34 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9588 to maintain the same effective KV cache size. -(EngineCore pid=704) WARNING 04-22 16:03:34 [v1/core/kv_cache_utils.py:1059] Add 1 padding layers, may waste at most 3.45% KV cache memory -(EngineCore pid=704) INFO 04-22 16:03:34 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 468,144 tokens -(EngineCore pid=704) INFO 04-22 16:03:34 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 57.05x -(EngineCore pid=704) 2026-04-22 16:03:34,271 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(EngineCore pid=704) DEBUG 04-22 16:03:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=704) 2026-04-22 16:03:34,282 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=704) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-23 01:03:13 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-23 01:03:13 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-23 01:03:13 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-23 01:03:13 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-23 01:03:13 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-23 01:03:13 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-4-E4B-it -(APIServer pid=1) INFO 04-23 01:03:13 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-23 01:03:13 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-23 01:03:13 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-4-E4B-it', 'model': 'google/gemma-4-E4B-it', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-23 01:03:13 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in -(APIServer pid=1) sys.exit(main()) -(APIServer pid=1) ^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main -(APIServer pid=1) args.dispatch_function(args) -(APIServer pid=1) File 
"/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd -(APIServer pid=1) uvloop.run(run_server(args)) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run -(APIServer pid=1) return __asyncio.run( -(APIServer pid=1) ^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run -(APIServer pid=1) return runner.run(main) -(APIServer pid=1) ^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run -(APIServer pid=1) return self._loop.run_until_complete(task) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper -(APIServer pid=1) return await main -(APIServer pid=1) ^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server -(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker -(APIServer pid=1) async with build_async_engine_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client -(APIServer pid=1) async with build_async_engine_client_from_engine_args( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer 
pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 124, in build_async_engine_client_from_engine_args -(APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1549, in create_engine_config -(APIServer pid=1) model_config = self.create_model_config() -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1398, in create_model_config -(APIServer pid=1) return ModelConfig( -(APIServer pid=1) ^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/pydantic/_internal/_dataclasses.py", line 121, in __init__ -(APIServer pid=1) s.__pydantic_validator__.validate_python(ArgsKwargs(args, kwargs), self_instance=s) -(APIServer pid=1) pydantic_core._pydantic_core.ValidationError: 1 validation error for ModelConfig -(APIServer pid=1) Value error, The checkpoint you are trying to load has model type `gemma4` but Transformers does not recognize this architecture. This could be because of an issue with the checkpoint, or because your version of Transformers is out of date. -(APIServer pid=1) -(APIServer pid=1) You can update Transformers with the command `pip install --upgrade transformers`. If this does not work, and the checkpoint is very new, then there may not be a release version that supports this model yet. 
In this case, you can get the most up-to-date code by installing Transformers from source with the command `pip install git+https://github.com/huggingface/transformers.git` [type=value_error, input_value=ArgsKwargs((), {'model': ...nderer_num_workers': 1}), input_type=ArgsKwargs] -(APIServer pid=1) For further information visit https://errors.pydantic.dev/2.12/v/value_error diff --git a/accuracy/results/v0.19.0/logs/google-gemma-7b--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/google-gemma-7b--h100-80gb--tp1pp1dp1--8192.log deleted file mode 100644 index c939bdb3..00000000 --- a/accuracy/results/v0.19.0/logs/google-gemma-7b--h100-80gb--tp1pp1dp1--8192.log +++ /dev/null @@ -1,776 +0,0 @@ -DEBUG 04-22 19:55:20 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 19:55:20 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 19:55:20 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 19:55:20 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:55:20 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:55:20 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 19:55:20 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 19:55:20 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 19:55:20 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 19:55:20 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:55:20 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:55:20 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-22 19:55:25 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 19:55:27 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 19:55:27 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 19:55:27 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 19:55:27 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 19:55:27 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 19:55:27 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 19:55:27 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 19:55:27 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 19:55:27 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model google/gemma-7b -(APIServer pid=1) INFO 04-22 19:55:27 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 19:55:27 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 19:55:27 [entrypoints/utils.py:233] non-default args: {'model_tag': 'google/gemma-7b', 'model': 'google/gemma-7b', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 19:55:27 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 19:55:28 [model_executor/models/registry.py:774] Cached model info file for class vllm.model_executor.models.gemma.GemmaForCausalLM not found -(APIServer pid=1) DEBUG 04-22 19:55:28 [model_executor/models/registry.py:834] Cache model info for class vllm.model_executor.models.gemma.GemmaForCausalLM miss. 
Loading model instead. -(APIServer pid=1) DEBUG 04-22 19:55:37 [model_executor/models/registry.py:844] Loaded model info for class vllm.model_executor.models.gemma.GemmaForCausalLM -(APIServer pid=1) DEBUG 04-22 19:55:38 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 9.7641226 secs -(APIServer pid=1) INFO 04-22 19:55:38 [config/model.py:549] Resolved architecture: GemmaForCausalLM -(APIServer pid=1) INFO 04-22 19:55:38 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 19:55:38 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 19:55:38 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 19:55:38 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 19:55:38 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 19:55:38 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 19:55:38 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 19:55:38 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 19:55:38 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 19:55:38 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 19:55:42 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 19:55:42 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 19:55:45 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 19:55:45 [platforms/__init__.py:37] Checking if TPU platform is available. 
-DEBUG 04-22 19:55:45 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 19:55:45 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:55:45 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:55:45 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 19:55:45 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 19:55:45 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 19:55:45 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 19:55:45 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:55:45 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:55:45 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 19:55:50 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(APIServer pid=1) DEBUG 04-22 19:55:52 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. -(EngineCore pid=476) DEBUG 04-22 19:55:52 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 19:55:52 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=476) DEBUG 04-22 19:55:52 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/ffb07a5d-fa70-4ff0-a712-25f0afaf7ecc'], outputs=['ipc:///tmp/aae2e87c-8846-4ae1-96f9-87bec80aa3ae'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=476) DEBUG 04-22 19:55:52 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=476) DEBUG 04-22 19:55:52 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=476) DEBUG 04-22 19:55:52 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=476) DEBUG 04-22 19:55:52 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=476) DEBUG 04-22 19:55:52 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=476) INFO 04-22 19:55:52 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='google/gemma-7b', speculative_config=None, tokenizer='google/gemma-7b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=google/gemma-7b, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=476) DEBUG 04-22 19:55:52 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.130.3.159:52289 backend=nccl -(EngineCore pid=476) INFO 04-22 19:55:52 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.130.3.159:52289 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=476) DEBUG 04-22 19:55:52 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=476) INFO 04-22 19:55:52 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=476) DEBUG 04-22 19:55:53 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776887753.2019594, auto_measure=True -(EngineCore pid=476) DEBUG 04-22 19:55:53 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=476) DEBUG 04-22 19:55:53 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=476) DEBUG 04-22 19:55:53 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=476) DEBUG 04-22 19:55:53 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=476) DEBUG 04-22 19:55:53 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=476) DEBUG 04-22 19:55:53 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=476) DEBUG 04-22 19:55:53 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=476) DEBUG 04-22 19:55:53 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=476) DEBUG 04-22 19:55:53 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=476) INFO 04-22 19:55:53 [v1/worker/gpu_model_runner.py:4735] Starting to load model google/gemma-7b... -(EngineCore pid=476) DEBUG 04-22 19:55:54 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=256, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=476) INFO 04-22 19:55:54 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=476) INFO 04-22 19:55:54 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=476) WARNING 04-22 19:55:54 [model_executor/models/gemma.py:67] Gemma's activation function was incorrectly set to exact GeLU in the config JSON file when it was initially released. Changing the activation function to approximate GeLU (`gelu_pytorch_tanh`). If you want to use the legacy `gelu`, edit the config JSON to set `hidden_activation=gelu` instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details. -(EngineCore pid=476) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
-(EngineCore pid=476) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=476) DEBUG 04-22 19:55:54 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=476) DEBUG 04-22 19:55:54 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=476) DEBUG 04-22 19:55:54 [config/compilation.py:1195] disabled custom ops: Counter({'gemma_rms_norm': 57, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'gelu_and_mul': 1, 'logits_processor': 1}) -(EngineCore pid=476) DEBUG 04-22 19:55:54 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=476) DEBUG 04-22 19:55:54 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00002-of-00004.safetensors']] -(APIServer pid=1) DEBUG 04-22 19:56:02 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 19:56:12 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 19:56:22 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 19:56:32 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 19:56:42 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 19:56:52 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 19:57:02 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(APIServer pid=1) DEBUG 04-22 19:57:12 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 19:57:22 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=476) INFO 04-22 19:57:28 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for google/gemma-7b: 94.085020 seconds -(EngineCore pid=476) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore 
pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gemma.py -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=476) INFO 04-22 19:57:47 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/3e4d7bd8e6/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=0835f40ede comp=e546579c48 code=d159a3198a8d5227bb8beeacc16de5095b5713eb9baa852b8ce11e4d03b46410 dir=/data/.cache/vllm/torch_compile_cache/3e4d7bd8e6/rank_0_0/backbone -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] Compile env 
factors (raw): -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': 
False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, 
-(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=476) 
DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 
[compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=476) DEBUG 04-22 19:57:47 
[compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 
[compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=476) DEBUG 04-22 
19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, 
-(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore 
pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 
[compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=476) DEBUG 04-22 19:57:47 [compilation/backends.py:1074] Vllm config hash: 0835f40ede -(EngineCore pid=476) INFO 04-22 19:57:47 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.44 s -(EngineCore pid=476) DEBUG 04-22 19:57:48 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=476) DEBUG 04-22 19:57:48 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=476) DEBUG 04-22 19:57:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices 
-(EngineCore pid=476) DEBUG 04-22 19:57:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=476) DEBUG 04-22 19:57:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms -(EngineCore pid=476) DEBUG 04-22 19:57:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=476) DEBUG 04-22 19:57:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=476) INFO 04-22 19:57:49 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=476) DEBUG 04-22 19:57:49 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/3e4d7bd8e6/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=476) DEBUG 04-22 19:57:49 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=476) DEBUG 04-22 19:57:49 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(EngineCore pid=476) DEBUG 04-22 19:57:49 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.3 ms -(EngineCore pid=476) DEBUG 04-22 19:57:49 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=476) DEBUG 04-22 19:57:49 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=476) DEBUG 04-22 19:57:50 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/3e4d7bd8e6/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=476) DEBUG 04-22 19:57:51 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=476) DEBUG 04-22 19:57:51 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms -(EngineCore pid=476) DEBUG 04-22 19:57:51 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.2 ms -(EngineCore pid=476) DEBUG 04-22 19:57:51 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=476) DEBUG 04-22 19:57:51 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(EngineCore pid=476) DEBUG 04-22 19:57:51 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/3e4d7bd8e6/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') -(EngineCore pid=476) INFO 04-22 19:57:51 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 3.92 s -(EngineCore pid=476) DEBUG 04-22 19:57:52 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/3e4d7bd8e6/rank_0_0/backbone/computation_graph.py -(APIServer pid=1) DEBUG 04-22 19:57:52 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=476) INFO 04-22 19:57:53 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/cfd90f7b6e9eca63e8913919f609c7f6cdef60949c23277267eda847f1fcac03/rank_0_0/model -(EngineCore pid=476) INFO 04-22 19:57:53 [compilation/monitor.py:48] torch.compile took 8.99 s in total -(EngineCore pid=476) INFO 04-22 19:57:53 [compilation/monitor.py:76] Initial profiling/warmup run took 0.52 s -(EngineCore pid=476) INFO 04-22 19:57:59 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=476) INFO 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=476) DEBUG 04-22 19:57:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=476) DEBUG 04-22 19:57:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 172.00 MiB first-capture + (51-1) × 6.00 MiB per-graph 
-(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=476) DEBUG 04-22 19:57:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=476) DEBUG 04-22 19:57:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=476) DEBUG 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=476) INFO 04-22 19:57:59 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total -(EngineCore pid=476) DEBUG 04-22 19:58:00 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=476) DEBUG 04-22 19:58:00 [v1/worker/gpu_worker.py:430] Free memory after profiling: 60.94 GiB (total), 57.5 GiB (within requested) -(EngineCore pid=476) DEBUG 04-22 19:58:00 [v1/worker/gpu_worker.py:435] Memory profiling takes 16.01 seconds. Total non KV cache memory: 19.79GiB; torch peak memory increase: 3.63GiB; non-torch forward increase memory: 0.25GiB; weights memory: 15.91GiB. 
-(EngineCore pid=476) INFO 04-22 19:58:00 [v1/worker/gpu_worker.py:436] Available KV cache memory: 55.44 GiB -(EngineCore pid=476) INFO 04-22 19:58:00 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. -(EngineCore pid=476) INFO 04-22 19:58:00 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 129,760 tokens -(EngineCore pid=476) INFO 04-22 19:58:00 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 15.84x -(EngineCore pid=476) 2026-04-22 19:58:00,118 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=476) DEBUG 04-22 19:58:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=476) 2026-04-22 19:58:00,126 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=476) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:47:43 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:47:43 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:47:43 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:47:43 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:47:43 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:47:43 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model ibm-granite/granite-3.1-2b-instruct -(APIServer pid=1) INFO 04-22 00:47:43 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:47:43 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:47:43 [entrypoints/utils.py:233] non-default args: {'model_tag': 'ibm-granite/granite-3.1-2b-instruct', 'model': 'ibm-granite/granite-3.1-2b-instruct', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:47:43 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:47:43 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.granite.GraniteForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:47:43 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0014069 secs -(APIServer pid=1) INFO 04-22 00:47:43 [config/model.py:549] Resolved architecture: GraniteForCausalLM -(APIServer pid=1) INFO 04-22 00:47:43 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:47:43 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:47:43 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:47:43 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:47:43 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:47:43 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) INFO 04-22 00:47:43 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:47:43 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:47:43 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:47:43 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:47:43 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:47:43 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:47:47 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:47:47 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:47:47 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:47:47 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:47:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:47:47 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:47:47 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:47:47 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:47:47 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:47:47 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:47:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:47:47 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:47:52 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-(EngineCore pid=241) DEBUG 04-22 00:47:53 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:47:53 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=241) DEBUG 04-22 00:47:53 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/0349d557-8ab3-47d9-9ad8-c814627504f4'], outputs=['ipc:///tmp/b76a23fe-76f6-4c98-b58e-8cebef12265e'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=241) DEBUG 04-22 00:47:53 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=241) DEBUG 04-22 00:47:53 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=241) DEBUG 04-22 00:47:53 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=241) DEBUG 04-22 00:47:53 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=241) DEBUG 04-22 00:47:53 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=241) INFO 04-22 00:47:53 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='ibm-granite/granite-3.1-2b-instruct', speculative_config=None, tokenizer='ibm-granite/granite-3.1-2b-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=ibm-granite/granite-3.1-2b-instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=241) DEBUG 04-22 00:47:54 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.80:51203 backend=nccl -(EngineCore pid=241) INFO 04-22 00:47:54 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.80:51203 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=241) DEBUG 04-22 00:47:54 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=241) INFO 04-22 00:47:54 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=241) DEBUG 04-22 00:47:54 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776818874.7326565, auto_measure=True -(EngineCore pid=241) DEBUG 04-22 00:47:54 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=241) DEBUG 04-22 00:47:54 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=241) DEBUG 04-22 00:47:54 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=241) DEBUG 04-22 00:47:54 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=241) DEBUG 04-22 00:47:54 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=241) DEBUG 04-22 00:47:54 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=241) DEBUG 04-22 00:47:54 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=241) DEBUG 04-22 00:47:54 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=241) DEBUG 04-22 00:47:54 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=241) INFO 04-22 00:47:54 [v1/worker/gpu_model_runner.py:4735] Starting to load model ibm-granite/granite-3.1-2b-instruct... -(EngineCore pid=241) DEBUG 04-22 00:47:55 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=64, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=241) INFO 04-22 00:47:55 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=241) INFO 04-22 00:47:55 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=241) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=241) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=241) DEBUG 04-22 00:47:55 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=241) DEBUG 04-22 00:47:55 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=241) DEBUG 04-22 00:47:55 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 81, 'silu_and_mul': 40, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=241) DEBUG 04-22 00:47:55 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=241) DEBUG 04-22 00:47:55 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors']] -(EngineCore pid=241) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 -(APIServer pid=1) DEBUG 04-22 00:48:03 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/granite.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=241) DEBUG 04-22 00:48:05 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=241) INFO 04-22 00:48:05 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/2cc6b1f3b1/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=bf3c110cf4 comp=e546579c48 code=9615bc1b6a3cdadf99f40f12188b347f2842282ef159341e824808308c14a2aa dir=/data/.cache/vllm/torch_compile_cache/2cc6b1f3b1/rank_0_0/backbone -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, 
-(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=241) DEBUG 04-22 00:48:05 
[compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 
[compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', 
-(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, 
-(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, 
-(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=241) 
DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=241) DEBUG 04-22 00:48:05 
[compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore 
pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 
'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=241) DEBUG 04-22 00:48:05 [compilation/backends.py:1074] Vllm config hash: bf3c110cf4 -(EngineCore pid=241) INFO 04-22 00:48:05 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.92 s -(EngineCore pid=241) DEBUG 04-22 00:48:06 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=241) DEBUG 04-22 00:48:06 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=241) DEBUG 04-22 00:48:06 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=241) DEBUG 04-22 00:48:06 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=241) DEBUG 04-22 00:48:06 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=241) DEBUG 04-22 00:48:06 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=241) DEBUG 04-22 00:48:06 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=241) INFO 04-22 00:48:08 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=241) DEBUG 04-22 00:48:08 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/2cc6b1f3b1/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=241) DEBUG 04-22 00:48:08 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=241) DEBUG 04-22 00:48:08 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(EngineCore pid=241) DEBUG 04-22 00:48:08 
[compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=241) DEBUG 04-22 00:48:08 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=241) DEBUG 04-22 00:48:08 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=241) DEBUG 04-22 00:48:09 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/2cc6b1f3b1/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=241) DEBUG 04-22 00:48:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=241) DEBUG 04-22 00:48:11 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.0 ms -(EngineCore pid=241) DEBUG 04-22 00:48:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=241) DEBUG 04-22 00:48:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=241) DEBUG 04-22 00:48:11 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=241) DEBUG 04-22 00:48:11 [compilation/backends.py:377] Store the 40-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_40', '/data/.cache/vllm/torch_compile_cache/2cc6b1f3b1/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_40') -(EngineCore pid=241) INFO 04-22 00:48:11 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.36 s -(EngineCore pid=241) DEBUG 04-22 00:48:11 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/2cc6b1f3b1/rank_0_0/backbone/computation_graph.py -(EngineCore pid=241) INFO 04-22 
00:48:12 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/d5e5b1f5dbb9fccea6e0b08fa936f7a8d04b152a219e5cf2027018b7be736bf9/rank_0_0/model -(EngineCore pid=241) INFO 04-22 00:48:12 [compilation/monitor.py:48] torch.compile took 11.58 s in total -(EngineCore pid=241) INFO 04-22 00:48:12 [compilation/monitor.py:76] Initial profiling/warmup run took 0.18 s -(APIServer pid=1) DEBUG 04-22 00:48:13 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=241) INFO 04-22 00:48:21 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=241) DEBUG 04-22 00:48:21 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=241) INFO 04-22 00:48:21 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=241) DEBUG 04-22 00:48:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) DEBUG 04-22 00:48:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) DEBUG 04-22 00:48:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) DEBUG 04-22 00:48:22 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5928] Estimated 
PIECEWISE CUDA graph memory: 100.00 MiB first-capture + (51-1) × 26.00 MiB per-graph -(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) DEBUG 04-22 00:48:22 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) DEBUG 04-22 00:48:22 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 134.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=241) DEBUG 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=241) INFO 04-22 00:48:22 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.60 GiB total -(EngineCore pid=241) DEBUG 04-22 00:48:23 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=241) DEBUG 04-22 00:48:23 [v1/worker/gpu_worker.py:430] Free memory after profiling: 73.15 GiB (total), 69.7 GiB (within requested) -(EngineCore pid=241) DEBUG 04-22 00:48:23 [v1/worker/gpu_worker.py:435] Memory profiling takes 22.35 seconds. Total non KV cache memory: 5.95GiB; torch peak memory increase: 0.75GiB; non-torch forward increase memory: 0.46GiB; weights memory: 4.74GiB. 
-(EngineCore pid=241) INFO 04-22 00:48:23 [v1/worker/gpu_worker.py:436] Available KV cache memory: 69.28 GiB -(EngineCore pid=241) INFO 04-22 00:48:23 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9702 to maintain the same effective KV cache size. -(EngineCore pid=241) INFO 04-22 00:48:23 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 908,048 tokens -(EngineCore pid=241) INFO 04-22 00:48:23 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 110.85x -(EngineCore pid=241) 2026-04-22 00:48:23,286 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=241) DEBUG 04-22 00:48:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) 2026-04-22 00:48:23,300 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=241) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:48:47 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:48:47 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:48:47 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:48:47 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:48:47 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:48:47 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model ibm-granite/granite-3.1-8b-instruct -(APIServer pid=1) INFO 04-22 00:48:47 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:48:47 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:48:47 [entrypoints/utils.py:233] non-default args: {'model_tag': 'ibm-granite/granite-3.1-8b-instruct', 'model': 'ibm-granite/granite-3.1-8b-instruct', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:48:47 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:48:48 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.granite.GraniteForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:48:48 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0010872 secs -(APIServer pid=1) INFO 04-22 00:48:48 [config/model.py:549] Resolved architecture: GraniteForCausalLM -(APIServer pid=1) INFO 04-22 00:48:48 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:48:48 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:48:48 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:48:48 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:48:48 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:48:48 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) INFO 04-22 00:48:48 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:48:48 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:48:48 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:48:48 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:48:48 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:48:48 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:48:52 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:48:52 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:48:52 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:48:52 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:48:52 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:48:52 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:48:52 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:48:52 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:48:52 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:48:52 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:48:52 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:48:52 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:48:57 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-(EngineCore pid=243) DEBUG 04-22 00:48:58 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:48:58 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=243) DEBUG 04-22 00:48:58 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/fa12e5fc-6b20-4e7a-b9ac-6e0b03ec1faf'], outputs=['ipc:///tmp/41c5cdb6-ddb7-45b0-84ef-bb868b12ef8f'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 00:48:58 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 00:48:58 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 00:48:58 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 00:48:58 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 00:48:58 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=243) INFO 04-22 00:48:58 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='ibm-granite/granite-3.1-8b-instruct', speculative_config=None, tokenizer='ibm-granite/granite-3.1-8b-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=ibm-granite/granite-3.1-8b-instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) DEBUG 04-22 00:48:59 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.31:48457 backend=nccl -(EngineCore pid=243) INFO 04-22 00:48:59 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.31:48457 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) DEBUG 04-22 00:48:59 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) INFO 04-22 00:48:59 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=243) DEBUG 04-22 00:48:59 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776818939.6237643, auto_measure=True -(EngineCore pid=243) DEBUG 04-22 00:48:59 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=243) DEBUG 04-22 00:48:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:48:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:48:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:48:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=243) DEBUG 04-22 00:48:59 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=243) DEBUG 04-22 00:48:59 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=243) DEBUG 04-22 00:48:59 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=243) DEBUG 04-22 00:48:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) INFO 04-22 00:48:59 [v1/worker/gpu_model_runner.py:4735] Starting to load model ibm-granite/granite-3.1-8b-instruct... -(EngineCore pid=243) DEBUG 04-22 00:49:00 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=243) INFO 04-22 00:49:00 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=243) INFO 04-22 00:49:00 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=243) DEBUG 04-22 00:49:00 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=243) DEBUG 04-22 00:49:00 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=243) DEBUG 04-22 00:49:00 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 81, 'silu_and_mul': 40, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=243) DEBUG 04-22 00:49:00 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=243) DEBUG 04-22 00:49:00 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors']] -(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(APIServer pid=1) DEBUG 04-22 00:49:18 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/granite.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=243) INFO 04-22 00:49:20 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/accd715892/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=9678c869f9 comp=e546579c48 code=9615bc1b6a3cdadf99f40f12188b347f2842282ef159341e824808308c14a2aa dir=/data/.cache/vllm/torch_compile_cache/accd715892/rank_0_0/backbone -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 
04-22 00:49:20 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 
'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 
1024, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 
'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': 
-1.0, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, 
-(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 
[compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, 
-(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 
'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=243) DEBUG 
04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/backends.py:1074] Vllm config hash: 9678c869f9 -(EngineCore pid=243) INFO 04-22 00:49:20 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.97 s -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=243) DEBUG 04-22 00:49:20 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=243) DEBUG 04-22 00:49:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:49:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=243) DEBUG 04-22 00:49:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=243) DEBUG 04-22 00:49:21 
[compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:49:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) INFO 04-22 00:49:23 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=243) DEBUG 04-22 00:49:23 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/accd715892/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=243) DEBUG 04-22 00:49:23 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:49:23 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=243) DEBUG 04-22 00:49:23 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=243) DEBUG 04-22 00:49:23 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:49:23 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) DEBUG 04-22 00:49:24 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/accd715892/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=243) DEBUG 04-22 00:49:25 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:49:25 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.0 ms -(EngineCore pid=243) DEBUG 04-22 00:49:25 [compilation/passes/vllm_inductor_pass.py:79] 
PostCleanupPass completed in 1.0 ms -(EngineCore pid=243) DEBUG 04-22 00:49:25 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:49:25 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(EngineCore pid=243) DEBUG 04-22 00:49:26 [compilation/backends.py:377] Store the 40-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_40', '/data/.cache/vllm/torch_compile_cache/accd715892/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_40') -(EngineCore pid=243) INFO 04-22 00:49:26 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.82 s -(EngineCore pid=243) DEBUG 04-22 00:49:26 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/accd715892/rank_0_0/backbone/computation_graph.py -(EngineCore pid=243) INFO 04-22 00:49:27 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/3fa591bfa994b8ea54e988e8287be9c45f124a0ed4b24d35f98d24f4e34631fa/rank_0_0/model -(EngineCore pid=243) INFO 04-22 00:49:27 [compilation/monitor.py:48] torch.compile took 12.08 s in total -(EngineCore pid=243) INFO 04-22 00:49:28 [compilation/monitor.py:76] Initial profiling/warmup run took 0.48 s -(APIServer pid=1) DEBUG 04-22 00:49:28 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) INFO 04-22 00:49:36 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=243) DEBUG 04-22 00:49:36 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=243) INFO 04-22 00:49:36 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:49:37 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:49:37 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 124.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:49:37 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
-(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:49:37 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=243) INFO 04-22 00:49:37 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.74 GiB total -(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.7 GiB (total), 59.26 GiB (within requested) -(EngineCore pid=243) DEBUG 04-22 00:49:37 [v1/worker/gpu_worker.py:435] Memory profiling takes 22.42 seconds. Total non KV cache memory: 16.57GiB; torch peak memory increase: 0.85GiB; non-torch forward increase memory: 0.46GiB; weights memory: 15.25GiB. -(EngineCore pid=243) INFO 04-22 00:49:37 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.66 GiB -(EngineCore pid=243) INFO 04-22 00:49:37 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9594 to maintain the same effective KV cache size. 
-(EngineCore pid=243) INFO 04-22 00:49:37 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 384,432 tokens -(EngineCore pid=243) INFO 04-22 00:49:37 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 46.93x -(EngineCore pid=243) 2026-04-22 00:49:38,019 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=243) DEBUG 04-22 00:49:38 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) 2026-04-22 00:49:38,031 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:50:05 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:50:05 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:50:05 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:50:05 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:50:05 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:50:05 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model ibm-granite/granite-3.3-8b-instruct -(APIServer pid=1) INFO 04-22 00:50:05 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:50:05 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:50:05 [entrypoints/utils.py:233] non-default args: {'model_tag': 'ibm-granite/granite-3.3-8b-instruct', 'model': 'ibm-granite/granite-3.3-8b-instruct', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:50:05 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:50:06 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.granite.GraniteForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:50:06 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003631 secs -(APIServer pid=1) INFO 04-22 00:50:06 [config/model.py:549] Resolved architecture: GraniteForCausalLM -(APIServer pid=1) INFO 04-22 00:50:06 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:50:06 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:50:06 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:50:06 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:50:06 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:50:06 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) INFO 04-22 00:50:06 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:50:06 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:50:06 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:50:06 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:50:06 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:50:06 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:50:10 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:50:10 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:50:10 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:50:10 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:50:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:50:10 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:50:10 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:50:10 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:50:10 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:50:10 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:50:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:50:10 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:50:14 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-(EngineCore pid=244) DEBUG 04-22 00:50:16 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:50:16 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=244) DEBUG 04-22 00:50:16 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/e19e2dca-0a9e-477a-b371-19c9091b3f73'], outputs=['ipc:///tmp/4e624f2d-09ca-410a-b00b-42134da1617e'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=244) DEBUG 04-22 00:50:16 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=244) DEBUG 04-22 00:50:16 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=244) DEBUG 04-22 00:50:16 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=244) DEBUG 04-22 00:50:16 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=244) DEBUG 04-22 00:50:16 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=244) INFO 04-22 00:50:16 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='ibm-granite/granite-3.3-8b-instruct', speculative_config=None, tokenizer='ibm-granite/granite-3.3-8b-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=ibm-granite/granite-3.3-8b-instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=244) DEBUG 04-22 00:50:16 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.81:39775 backend=nccl -(EngineCore pid=244) INFO 04-22 00:50:16 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.81:39775 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) DEBUG 04-22 00:50:16 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) INFO 04-22 00:50:16 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=244) DEBUG 04-22 00:50:17 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776819017.1554534, auto_measure=True -(EngineCore pid=244) DEBUG 04-22 00:50:17 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=244) DEBUG 04-22 00:50:17 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:50:17 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:50:17 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:50:17 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=244) DEBUG 04-22 00:50:17 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=244) DEBUG 04-22 00:50:17 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=244) DEBUG 04-22 00:50:17 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=244) DEBUG 04-22 00:50:17 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) INFO 04-22 00:50:17 [v1/worker/gpu_model_runner.py:4735] Starting to load model ibm-granite/granite-3.3-8b-instruct... -(EngineCore pid=244) DEBUG 04-22 00:50:17 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=244) INFO 04-22 00:50:17 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=244) INFO 04-22 00:50:17 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=244) DEBUG 04-22 00:50:18 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=244) DEBUG 04-22 00:50:18 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=244) DEBUG 04-22 00:50:18 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 81, 'silu_and_mul': 40, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=244) DEBUG 04-22 00:50:18 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=244) DEBUG 04-22 00:50:18 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors']] -(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(APIServer pid=1) DEBUG 04-22 00:50:36 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/granite.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=244) INFO 04-22 00:50:37 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/ad49f5c00b/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=e817fe24ec comp=e546579c48 code=9615bc1b6a3cdadf99f40f12188b347f2842282ef159341e824808308c14a2aa dir=/data/.cache/vllm/torch_compile_cache/ad49f5c00b/rank_0_0/backbone -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 
04-22 00:50:37 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 
'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 
1024, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 
'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': 
-1.0, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, 
-(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 
[compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, 
-(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 
'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=244) DEBUG 
04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=244) DEBUG 04-22 00:50:37 [compilation/backends.py:1074] Vllm config hash: e817fe24ec -(EngineCore pid=244) INFO 04-22 00:50:37 [compilation/backends.py:1111] Dynamo bytecode transform time: 5.01 s -(EngineCore pid=244) DEBUG 04-22 00:50:38 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=244) DEBUG 04-22 00:50:38 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=244) DEBUG 04-22 00:50:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:50:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=244) DEBUG 04-22 00:50:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=244) DEBUG 04-22 00:50:38 
[compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:50:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=244) INFO 04-22 00:50:40 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=244) DEBUG 04-22 00:50:40 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/ad49f5c00b/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=244) DEBUG 04-22 00:50:40 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:50:40 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=244) DEBUG 04-22 00:50:40 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=244) DEBUG 04-22 00:50:40 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:50:40 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=244) DEBUG 04-22 00:50:41 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/ad49f5c00b/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=244) DEBUG 04-22 00:50:42 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:50:42 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(EngineCore pid=244) DEBUG 04-22 00:50:42 [compilation/passes/vllm_inductor_pass.py:79] 
PostCleanupPass completed in 0.9 ms -(EngineCore pid=244) DEBUG 04-22 00:50:42 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:50:42 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=244) DEBUG 04-22 00:50:43 [compilation/backends.py:377] Store the 40-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_40', '/data/.cache/vllm/torch_compile_cache/ad49f5c00b/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_40') -(EngineCore pid=244) INFO 04-22 00:50:43 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.66 s -(EngineCore pid=244) DEBUG 04-22 00:50:43 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/ad49f5c00b/rank_0_0/backbone/computation_graph.py -(EngineCore pid=244) INFO 04-22 00:50:44 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/e0255b29bbc50ba3ddb543906d793d5561501aae395c54778798896484ebac2c/rank_0_0/model -(EngineCore pid=244) INFO 04-22 00:50:44 [compilation/monitor.py:48] torch.compile took 11.95 s in total -(EngineCore pid=244) INFO 04-22 00:50:45 [compilation/monitor.py:76] Initial profiling/warmup run took 0.47 s -(APIServer pid=1) DEBUG 04-22 00:50:46 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=244) INFO 04-22 00:50:53 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=244) DEBUG 04-22 00:50:53 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=244) INFO 04-22 00:50:53 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:50:54 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:50:54 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 124.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:50:54 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
-(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:50:54 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=244) INFO 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.74 GiB total -(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.7 GiB (total), 59.26 GiB (within requested) -(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_worker.py:435] Memory profiling takes 22.12 seconds. Total non KV cache memory: 16.57GiB; torch peak memory increase: 0.85GiB; non-torch forward increase memory: 0.46GiB; weights memory: 15.25GiB. -(EngineCore pid=244) INFO 04-22 00:50:54 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.66 GiB -(EngineCore pid=244) INFO 04-22 00:50:54 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9594 to maintain the same effective KV cache size. 
-(EngineCore pid=244) INFO 04-22 00:50:54 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 384,432 tokens -(EngineCore pid=244) INFO 04-22 00:50:54 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 46.93x -(EngineCore pid=244) 2026-04-22 00:50:54,971 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=244) DEBUG 04-22 00:50:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) 2026-04-22 00:50:54,983 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:51:21 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:51:21 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:51:21 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:51:21 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:51:21 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:51:21 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model ibm-granite/granite-vision-3.3-2b -(APIServer pid=1) INFO 04-22 00:51:21 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:51:21 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:51:21 [entrypoints/utils.py:233] non-default args: {'model_tag': 'ibm-granite/granite-vision-3.3-2b', 'model': 'ibm-granite/granite-vision-3.3-2b', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:51:21 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:51:21 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llava_next.LlavaNextForConditionalGeneration from cache -(APIServer pid=1) DEBUG 04-22 00:51:21 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0015486 secs -(APIServer pid=1) INFO 04-22 00:51:21 [config/model.py:549] Resolved architecture: LlavaNextForConditionalGeneration -(APIServer pid=1) INFO 04-22 00:51:21 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:51:21 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:51:21 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:51:21 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:51:21 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 00:51:21 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 00:51:21 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:51:21 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:51:21 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:51:21 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:51:22 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:51:22 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. -(APIServer pid=1) DEBUG 04-22 00:51:22 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:51:27 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:51:27 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:51:27 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:51:27 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:51:27 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:51:27 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:51:27 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:51:27 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:51:27 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:51:27 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:51:27 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
-DEBUG 04-22 00:51:27 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:51:32 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=467) DEBUG 04-22 00:51:34 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:51:34 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=467) DEBUG 04-22 00:51:34 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/832edc0f-9891-4845-ba97-32535407db1d'], outputs=['ipc:///tmp/be266a52-4723-4add-a4ab-66d2955016e6'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=467) DEBUG 04-22 00:51:34 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=467) DEBUG 04-22 00:51:34 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=467) DEBUG 04-22 00:51:34 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=467) DEBUG 04-22 00:51:34 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=467) DEBUG 04-22 00:51:34 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=467) INFO 04-22 00:51:34 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='ibm-granite/granite-vision-3.3-2b', speculative_config=None, tokenizer='ibm-granite/granite-vision-3.3-2b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=ibm-granite/granite-vision-3.3-2b, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=467) DEBUG 04-22 00:51:34 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(EngineCore pid=467) DEBUG 04-22 00:51:35 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.32:52891 backend=nccl -(EngineCore pid=467) INFO 04-22 00:51:35 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.32:52891 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(EngineCore pid=467) DEBUG 04-22 00:51:35 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=467) INFO 04-22 00:51:35 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=467) DEBUG 04-22 00:51:35 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776819095.7213402, auto_measure=True -(EngineCore pid=467) DEBUG 04-22 00:51:35 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=467) DEBUG 04-22 00:51:35 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=467) DEBUG 04-22 00:51:35 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=467) DEBUG 04-22 00:51:35 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=467) DEBUG 04-22 00:51:35 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=467) 
DEBUG 04-22 00:51:35 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(EngineCore pid=467) DEBUG 04-22 00:51:35 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=467) DEBUG 04-22 00:51:35 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. -(EngineCore pid=467) DEBUG 04-22 00:51:37 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=467) INFO 04-22 00:51:37 [v1/worker/gpu_model_runner.py:4735] Starting to load model ibm-granite/granite-vision-3.3-2b... -(EngineCore pid=467) INFO 04-22 00:51:37 [model_executor/models/interfaces.py:171] Contains out of vocabulary multimodal tokens? False -(EngineCore pid=467) INFO 04-22 00:51:37 [platforms/cuda.py:390] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention -(EngineCore pid=467) INFO 04-22 00:51:37 [model_executor/.../attention/mm_encoder_attention.py:230] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention. -(EngineCore pid=467) INFO 04-22 00:51:37 [config/vllm.py:790] Asynchronous scheduling is enabled. -(EngineCore pid=467) DEBUG 04-22 00:51:37 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=467) DEBUG 04-22 00:51:38 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=64, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. 
-(EngineCore pid=467) INFO 04-22 00:51:38 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=467) INFO 04-22 00:51:38 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=467) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=467) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=467) DEBUG 04-22 00:51:38 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=467) DEBUG 04-22 00:51:38 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) -(EngineCore pid=467) DEBUG 04-22 00:51:38 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 81, 'silu_and_mul': 40, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=467) DEBUG 04-22 00:51:38 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) -(EngineCore pid=467) DEBUG 04-22 00:51:38 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 81, 'silu_and_mul': 40, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=467) DEBUG 04-22 00:51:38 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(EngineCore pid=467) DEBUG 04-22 00:51:38 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors']] -(EngineCore pid=467) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/granite.py -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=467) INFO 04-22 00:51:49 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/56a7cc408f/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=107570cfa8 comp=e546579c48 code=bf166f325866070c057071cfb4009752565a89cce07da824263e5292c4847928 dir=/data/.cache/vllm/torch_compile_cache/56a7cc408f/rank_0_0/backbone -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 
'NVCC_THREADS': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=467) DEBUG 04-22 
00:51:49 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=467) DEBUG 04-22 00:51:49 
[compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=467) DEBUG 04-22 
00:51:49 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 
'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 
'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', 
-(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=467) DEBUG 04-22 00:51:49 
[compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, 
-(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/backends.py:1074] Vllm config hash: 107570cfa8 -(EngineCore pid=467) INFO 04-22 00:51:49 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.91 s -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=467) DEBUG 04-22 00:51:49 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=467) DEBUG 04-22 00:51:50 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=467) DEBUG 04-22 00:51:50 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=467) DEBUG 04-22 00:51:50 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms -(EngineCore pid=467) DEBUG 04-22 00:51:50 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes 
-(EngineCore pid=467) DEBUG 04-22 00:51:50 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=467) INFO 04-22 00:51:51 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=467) DEBUG 04-22 00:51:51 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/56a7cc408f/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=467) DEBUG 04-22 00:51:52 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=467) DEBUG 04-22 00:51:52 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=467) DEBUG 04-22 00:51:52 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms -(EngineCore pid=467) DEBUG 04-22 00:51:52 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=467) DEBUG 04-22 00:51:52 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=467) DEBUG 04-22 00:51:53 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/56a7cc408f/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(APIServer pid=1) DEBUG 04-22 00:51:54 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=467) DEBUG 04-22 00:51:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=467) DEBUG 04-22 00:51:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(EngineCore pid=467) DEBUG 04-22 00:51:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=467) DEBUG 04-22 00:51:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=467) DEBUG 04-22 00:51:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=467) DEBUG 04-22 00:51:54 [compilation/backends.py:377] Store the 40-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_40', '/data/.cache/vllm/torch_compile_cache/56a7cc408f/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_40') -(EngineCore pid=467) INFO 04-22 00:51:54 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.13 s -(EngineCore pid=467) DEBUG 04-22 00:51:54 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/56a7cc408f/rank_0_0/backbone/computation_graph.py -(EngineCore pid=467) INFO 04-22 00:51:56 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/6b5b3eda07b021ffbc2d42a757d2f8d9e2ac125e9658c69b9579ea1352bc8d9a/rank_0_0/model -(EngineCore pid=467) INFO 04-22 00:51:56 [compilation/monitor.py:48] torch.compile took 11.70 s in total -(EngineCore pid=467) INFO 04-22 00:51:56 [compilation/monitor.py:76] Initial profiling/warmup run took 0.10 s -(EngineCore pid=467) INFO 04-22 00:52:01 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=467) DEBUG 04-22 00:52:01 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for 
CUDA graph profiling -(EngineCore pid=467) INFO 04-22 00:52:01 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=467) DEBUG 04-22 00:52:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=467) DEBUG 04-22 00:52:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 100.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=467) DEBUG 04-22 00:52:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None 
-(EngineCore pid=467) DEBUG 04-22 00:52:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 134.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=467) DEBUG 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=467) INFO 04-22 00:52:02 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.62 GiB total -(EngineCore pid=467) DEBUG 04-22 00:52:03 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=467) DEBUG 04-22 00:52:03 [v1/worker/gpu_worker.py:430] Free memory after profiling: 72.63 GiB (total), 69.18 GiB (within requested) -(EngineCore pid=467) DEBUG 04-22 00:52:03 [v1/worker/gpu_worker.py:435] Memory profiling takes 18.74 seconds. Total non KV cache memory: 6.58GiB; torch peak memory increase: 0.79GiB; non-torch forward increase memory: 0.25GiB; weights memory: 5.54GiB. -(EngineCore pid=467) INFO 04-22 00:52:03 [v1/worker/gpu_worker.py:436] Available KV cache memory: 68.65 GiB -(EngineCore pid=467) INFO 04-22 00:52:03 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9578 to maintain the same effective KV cache size. 
-(EngineCore pid=467) INFO 04-22 00:52:03 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 899,792 tokens -(EngineCore pid=467) INFO 04-22 00:52:03 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 109.84x -(EngineCore pid=467) 2026-04-22 00:52:03,056 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=467) DEBUG 04-22 00:52:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=467) 2026-04-22 00:52:03,068 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=467) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:23:30 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:23:30 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:23:30 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:23:30 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:23:30 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:23:30 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct -(APIServer pid=1) INFO 04-22 00:23:30 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:23:30 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:23:30 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'dtype': 'bfloat16', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'kv_cache_dtype': 'fp8', 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:23:30 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:23:31 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:23:31 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003864 secs -(APIServer pid=1) INFO 04-22 00:23:31 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 00:23:31 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:23:31 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:23:31 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:23:31 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:23:31 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 00:23:31 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 00:23:31 [config/cache.py:227] Using fp8 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. Meanwhile, it may cause accuracy drop without a proper scaling factor. -(APIServer pid=1) INFO 04-22 00:23:31 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:23:31 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:23:31 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:23:31 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:23:31 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:23:31 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:23:35 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:23:35 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:23:35 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:23:35 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:23:35 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:23:35 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:23:35 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:23:35 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:23:35 [platforms/__init__.py:165] Checking if CPU platform is available. 
-DEBUG 04-22 00:23:35 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:23:35 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:23:35 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:23:40 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=241) DEBUG 04-22 00:23:41 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:23:41 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=241) DEBUG 04-22 00:23:41 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/ee17de87-bccf-49f5-a945-1c7fdda0e114'], outputs=['ipc:///tmp/19d96faf-9b91-482b-9350-d3006f87dc8b'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=241) DEBUG 04-22 00:23:41 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=241) DEBUG 04-22 00:23:41 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=241) DEBUG 04-22 00:23:41 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=241) DEBUG 04-22 00:23:41 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=241) DEBUG 04-22 00:23:41 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=241) INFO 04-22 00:23:41 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=fp8, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=241) DEBUG 04-22 00:23:42 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.193:53205 backend=nccl -(EngineCore pid=241) INFO 04-22 00:23:42 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.193:53205 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=241) DEBUG 04-22 00:23:42 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=241) INFO 04-22 00:23:42 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=241) DEBUG 04-22 00:23:42 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776817422.4285054, auto_measure=True -(EngineCore pid=241) DEBUG 04-22 00:23:42 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=241) DEBUG 04-22 00:23:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=241) DEBUG 04-22 00:23:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=241) DEBUG 04-22 00:23:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=241) DEBUG 04-22 00:23:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=241) DEBUG 04-22 00:23:42 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=241) DEBUG 04-22 00:23:42 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=241) DEBUG 04-22 00:23:42 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=241) INFO 04-22 00:23:42 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... -(EngineCore pid=241) DEBUG 04-22 00:23:43 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=fp8, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLEX_ATTENTION: [kv_cache_dtype not supported]}. -(EngineCore pid=241) INFO 04-22 00:23:43 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN']. -(EngineCore pid=241) INFO 04-22 00:23:43 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=241) INFO 04-22 00:23:43 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. -(EngineCore pid=241) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=241) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=241) DEBUG 04-22 00:23:43 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=241) DEBUG 04-22 00:23:43 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=241) DEBUG 04-22 00:23:43 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'quant_fp8': 32, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=241) DEBUG 04-22 00:23:43 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=241) DEBUG 04-22 00:23:43 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00002-of-00004.safetensors']] -(EngineCore pid=241) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(APIServer pid=1) DEBUG 04-22 00:23:51 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=241) INFO 04-22 00:23:54 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/16bd0cf229/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=a3a40b90b5 comp=e546579c48 code=a481fe428ff1d132f00a2eb629698322104288cdeebd6384595457f75f95534e dir=/data/.cache/vllm/torch_compile_cache/16bd0cf229/rank_0_0/backbone -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] Compile env factors 
(raw): -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, 
-(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore 
pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=241) DEBUG 04-22 
00:23:54 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 
[compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=241) DEBUG 04-22 00:23:54 
[compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 
[compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=241) DEBUG 04-22 
00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, 
-(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore 
pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 
[compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=241) DEBUG 04-22 00:23:54 [compilation/backends.py:1074] Vllm config hash: a3a40b90b5 -(EngineCore pid=241) INFO 04-22 00:23:54 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.56 s -(EngineCore pid=241) DEBUG 04-22 00:23:55 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=241) DEBUG 04-22 00:23:55 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=241) DEBUG 04-22 00:23:55 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices 
-(EngineCore pid=241) DEBUG 04-22 00:23:55 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=241) DEBUG 04-22 00:23:55 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=241) DEBUG 04-22 00:23:55 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=241) DEBUG 04-22 00:23:55 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=241) INFO 04-22 00:23:57 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=241) DEBUG 04-22 00:23:57 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/16bd0cf229/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=241) DEBUG 04-22 00:23:57 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=241) DEBUG 04-22 00:23:57 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=241) DEBUG 04-22 00:23:57 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.2 ms -(EngineCore pid=241) DEBUG 04-22 00:23:57 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=241) DEBUG 04-22 00:23:57 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=241) DEBUG 04-22 00:23:59 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/16bd0cf229/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=241) DEBUG 04-22 00:24:00 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=241) DEBUG 04-22 00:24:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(EngineCore pid=241) DEBUG 04-22 00:24:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=241) DEBUG 04-22 00:24:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=241) DEBUG 04-22 00:24:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=241) DEBUG 04-22 00:24:00 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/16bd0cf229/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') -(EngineCore pid=241) INFO 04-22 00:24:00 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.35 s -(EngineCore pid=241) DEBUG 04-22 00:24:00 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/16bd0cf229/rank_0_0/backbone/computation_graph.py -(APIServer pid=1) DEBUG 04-22 00:24:01 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=241) INFO 04-22 00:24:01 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/fadd5d0dcb1cecef36f6290c55d85692abb624cc2fa3646ab99c523f39738c6c/rank_0_0/model -(EngineCore pid=241) INFO 04-22 00:24:01 [compilation/monitor.py:48] torch.compile took 11.42 s in total -(EngineCore pid=241) INFO 04-22 00:24:02 [compilation/monitor.py:76] Initial profiling/warmup run took 0.45 s -(EngineCore pid=241) INFO 04-22 00:24:07 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=241) INFO 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) DEBUG 04-22 00:24:07 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) DEBUG 04-22 00:24:07 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 124.00 MiB first-capture + (51-1) × 6.00 MiB per-graph 
-(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) DEBUG 04-22 00:24:07 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) DEBUG 04-22 00:24:07 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=241) DEBUG 04-22 00:24:07 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 260.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=241) DEBUG 04-22 00:24:08 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=241) INFO 04-22 00:24:08 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total -(EngineCore pid=241) DEBUG 04-22 00:24:08 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=241) DEBUG 04-22 00:24:08 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.24 GiB (total), 58.79 GiB (within requested) -(EngineCore pid=241) DEBUG 04-22 00:24:08 [v1/worker/gpu_worker.py:435] Memory profiling takes 18.33 seconds. Total non KV cache memory: 17.12GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 14.99GiB. 
-(EngineCore pid=241) INFO 04-22 00:24:08 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.11 GiB -(EngineCore pid=241) INFO 04-22 00:24:08 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. -(EngineCore pid=241) INFO 04-22 00:24:08 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 952,032 tokens -(EngineCore pid=241) INFO 04-22 00:24:08 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 116.21x -(EngineCore pid=241) 2026-04-22 00:24:08,713 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=241) DEBUG 04-22 00:24:08 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=241) 2026-04-22 00:24:08,721 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=241) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:06:59 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:06:59 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:06:59 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:06:59 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:06:59 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:06:59 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct -(APIServer pid=1) INFO 04-22 00:06:59 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:06:59 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:06:59 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'dtype': 'bfloat16', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:06:59 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:07:00 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:07:00 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003826 secs -(APIServer pid=1) INFO 04-22 00:07:00 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 00:07:00 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:07:00 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:07:00 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:07:00 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:07:00 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 00:07:00 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 00:07:00 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:07:00 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:07:00 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:07:00 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:07:00 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:07:00 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:07:04 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:07:04 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:07:04 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:07:04 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:07:04 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:07:04 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:07:04 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:07:04 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:07:04 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:07:04 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:07:04 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:07:04 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-22 00:07:08 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=244) DEBUG 04-22 00:07:10 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:07:10 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=244) DEBUG 04-22 00:07:10 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/1660a71c-bad1-4fd1-97d6-d5652f5ff393'], outputs=['ipc:///tmp/c79bb2ea-2ffb-494e-95e5-68061e6538d9'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=244) DEBUG 04-22 00:07:10 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=244) DEBUG 04-22 00:07:10 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=244) DEBUG 04-22 00:07:10 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=244) DEBUG 04-22 00:07:10 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=244) DEBUG 04-22 00:07:10 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=244) INFO 04-22 00:07:10 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=244) DEBUG 04-22 00:07:10 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.187:47257 backend=nccl -(EngineCore pid=244) INFO 04-22 00:07:10 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.187:47257 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) DEBUG 04-22 00:07:10 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) INFO 04-22 00:07:10 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=244) DEBUG 04-22 00:07:11 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776816431.302875, auto_measure=True -(EngineCore pid=244) DEBUG 04-22 00:07:11 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=244) DEBUG 04-22 00:07:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:07:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:07:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:07:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=244) DEBUG 04-22 00:07:11 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=244) DEBUG 04-22 00:07:11 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=244) DEBUG 04-22 00:07:11 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=244) INFO 04-22 00:07:11 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... -(EngineCore pid=244) DEBUG 04-22 00:07:12 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=244) INFO 04-22 00:07:12 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=244) INFO 04-22 00:07:12 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=244) DEBUG 04-22 00:07:12 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=244) DEBUG 04-22 00:07:12 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=244) DEBUG 04-22 00:07:12 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=244) DEBUG 04-22 00:07:12 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=244) DEBUG 04-22 00:07:12 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors']] -(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:54:08 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:54:08 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:54:08 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:54:08 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:54:08 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:54:08 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct -(APIServer pid=1) INFO 04-22 01:54:08 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:54:08 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:54:08 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 8192, 'quantization': 'fp8', 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:54:08 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:54:08 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:54:08 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0024651 secs -(APIServer pid=1) INFO 04-22 01:54:08 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 01:54:08 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 01:54:08 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:54:08 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:54:08 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:54:08 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 01:54:08 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 01:54:08 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:54:09 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 01:54:09 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:54:09 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:54:09 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:54:09 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:54:13 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:54:13 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:54:13 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:54:13 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:54:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:54:13 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:54:13 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:54:13 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:54:13 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:54:13 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:54:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:54:13 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-22 01:54:18 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=245) DEBUG 04-22 01:54:19 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:54:19 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=245) DEBUG 04-22 01:54:19 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/dee96d97-f77e-444a-b32d-83078a8149a7'], outputs=['ipc:///tmp/0b541461-0bd5-4e24-b152-548bdd8dc3ad'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=245) DEBUG 04-22 01:54:19 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=245) DEBUG 04-22 01:54:19 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=245) DEBUG 04-22 01:54:19 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=245) DEBUG 04-22 01:54:19 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=245) DEBUG 04-22 01:54:19 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=245) INFO 04-22 01:54:19 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=fp8, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=245) DEBUG 04-22 01:54:20 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.43:46413 backend=nccl -(EngineCore pid=245) INFO 04-22 01:54:20 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.43:46413 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=245) DEBUG 04-22 01:54:20 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=245) INFO 04-22 01:54:20 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=245) DEBUG 04-22 01:54:20 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822860.681496, auto_measure=True -(EngineCore pid=245) DEBUG 04-22 01:54:20 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=245) DEBUG 04-22 01:54:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=245) DEBUG 04-22 01:54:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=245) DEBUG 04-22 01:54:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=245) DEBUG 04-22 01:54:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=245) DEBUG 04-22 01:54:20 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=245) DEBUG 04-22 01:54:20 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=245) DEBUG 04-22 01:54:20 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=245) INFO 04-22 01:54:20 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... -(EngineCore pid=245) INFO 04-22 01:54:21 [model_executor/.../linear/__init__.py:261] Selected CutlassFP8ScaledMMLinearKernel for Fp8OnlineLinearMethod -(EngineCore pid=245) INFO 04-22 01:54:21 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. -(EngineCore pid=245) DEBUG 04-22 01:54:21 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=245) INFO 04-22 01:54:21 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=245) INFO 04-22 01:54:21 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=245) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=245) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=245) DEBUG 04-22 01:54:21 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=245) DEBUG 04-22 01:54:21 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=245) DEBUG 04-22 01:54:21 [config/compilation.py:1195] disabled custom ops: Counter({'quant_fp8': 128, 'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=245) DEBUG 04-22 01:54:21 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=245) DEBUG 04-22 01:54:22 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors']] -(EngineCore pid=245) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(APIServer pid=1) DEBUG 04-22 01:54:39 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=245) DEBUG 04-22 
01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/fp8.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=245) INFO 04-22 01:54:42 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/b8b812d5c5/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1063] torch.compile 
cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=a909408016 comp=e546579c48 code=5379d6d5677d7cdb4c84e4840e951dc1c2c3978435fcf06c17411f78c54e9030 dir=/data/.cache/vllm/torch_compile_cache/b8b812d5c5/rank_0_0/backbone -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, 
-(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': 
None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore 
pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=245) DEBUG 04-22 01:54:42 
[compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=245) DEBUG 04-22 
01:54:42 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=245) DEBUG 04-22 01:54:42 
[compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=245) DEBUG 04-22 
01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore 
pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=245) 
DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 
'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/backends.py:1074] Vllm config hash: a909408016 -(EngineCore pid=245) INFO 04-22 01:54:42 [compilation/backends.py:1111] Dynamo bytecode transform time: 7.27 s -(EngineCore pid=245) DEBUG 04-22 01:54:42 
[compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=245) DEBUG 04-22 01:54:42 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=245) DEBUG 04-22 01:54:43 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=245) DEBUG 04-22 01:54:43 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=245) DEBUG 04-22 01:54:43 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.3 ms -(EngineCore pid=245) DEBUG 04-22 01:54:43 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=245) DEBUG 04-22 01:54:43 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=245) INFO 04-22 01:54:45 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=245) DEBUG 04-22 01:54:45 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/b8b812d5c5/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=245) DEBUG 04-22 01:54:46 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=245) DEBUG 04-22 01:54:46 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(EngineCore pid=245) DEBUG 04-22 01:54:46 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.9 ms -(EngineCore pid=245) DEBUG 04-22 01:54:46 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=245) DEBUG 04-22 01:54:46 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=245) DEBUG 04-22 
01:54:48 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/b8b812d5c5/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=245) DEBUG 04-22 01:54:49 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=245) DEBUG 04-22 01:54:49 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.4 ms -(EngineCore pid=245) DEBUG 04-22 01:54:49 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.7 ms -(EngineCore pid=245) DEBUG 04-22 01:54:49 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=245) DEBUG 04-22 01:54:49 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(APIServer pid=1) DEBUG 04-22 01:54:49 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=245) DEBUG 04-22 01:54:49 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/b8b812d5c5/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') -(EngineCore pid=245) INFO 04-22 01:54:49 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 7.36 s -(EngineCore pid=245) DEBUG 04-22 01:54:50 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/b8b812d5c5/rank_0_0/backbone/computation_graph.py -(EngineCore pid=245) INFO 04-22 01:54:52 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/b93efa87a050be75284475ae3f54704fbf7b8794a49f695fb7f3527dd72f6157/rank_0_0/model -(EngineCore pid=245) INFO 04-22 01:54:52 [compilation/monitor.py:48] torch.compile took 17.02 s in total -(EngineCore pid=245) INFO 04-22 01:54:52 [compilation/monitor.py:76] Initial profiling/warmup run took 0.32 s -(EngineCore pid=245) INFO 04-22 01:54:57 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=245) DEBUG 04-22 01:54:57 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=245) INFO 04-22 01:54:57 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 01:54:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore 
pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 01:54:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 98.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 01:54:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 01:54:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=245) DEBUG 04-22 01:54:58 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 260.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=245) DEBUG 04-22 01:54:59 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=245) INFO 04-22 01:54:59 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total -(EngineCore pid=245) DEBUG 04-22 
01:54:59 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=245) DEBUG 04-22 01:54:59 [v1/worker/gpu_worker.py:430] Free memory after profiling: 69.57 GiB (total), 66.12 GiB (within requested) -(EngineCore pid=245) DEBUG 04-22 01:54:59 [v1/worker/gpu_worker.py:435] Memory profiling takes 24.38 seconds. Total non KV cache memory: 10.62GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 8.49GiB. -(EngineCore pid=245) INFO 04-22 01:54:59 [v1/worker/gpu_worker.py:436] Available KV cache memory: 64.61 GiB -(EngineCore pid=245) INFO 04-22 01:54:59 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. -(EngineCore pid=245) INFO 04-22 01:54:59 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 529,248 tokens -(EngineCore pid=245) INFO 04-22 01:54:59 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 64.61x -(EngineCore pid=245) 2026-04-22 01:54:59,423 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(EngineCore pid=245) DEBUG 04-22 01:54:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) 2026-04-22 01:54:59,431 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=245) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:05:59 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:05:59 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 00:05:59 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:05:59 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:05:59 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:05:59 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct -(APIServer pid=1) INFO 04-22 00:05:59 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:05:59 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:05:59 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'dtype': 'float16', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:05:59 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:05:59 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:05:59 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003684 secs -(APIServer pid=1) INFO 04-22 00:05:59 [config/model.py:549] 
Resolved architecture: LlamaForCausalLM -(APIServer pid=1) WARNING 04-22 00:05:59 [config/model.py:2016] Casting torch.bfloat16 to torch.float16. -(APIServer pid=1) INFO 04-22 00:05:59 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:05:59 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:05:59 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:05:59 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:05:59 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:05:59 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 00:05:59 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:05:59 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:05:59 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:05:59 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:06:00 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:06:00 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:06:03 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:06:03 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:06:03 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:06:03 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:06:03 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
-DEBUG 04-22 00:06:03 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:06:03 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:06:03 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:06:03 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:06:03 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:06:03 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:06:03 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:06:08 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=243) DEBUG 04-22 00:06:10 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:06:10 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=243) DEBUG 04-22 00:06:10 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/fca1389a-59c4-4e9b-a75c-f1faad6c104b'], outputs=['ipc:///tmp/c00a72ab-c547-4d85-92e6-2c8582029bd6'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 00:06:10 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 00:06:10 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 00:06:10 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 00:06:10 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 00:06:10 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=243) INFO 04-22 00:06:10 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) DEBUG 04-22 00:06:10 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.186:55885 backend=nccl -(EngineCore pid=243) INFO 04-22 00:06:10 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.186:55885 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) DEBUG 04-22 00:06:10 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) INFO 04-22 00:06:10 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=243) DEBUG 04-22 00:06:11 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776816371.008869, auto_measure=True -(EngineCore pid=243) DEBUG 04-22 00:06:11 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=243) DEBUG 04-22 00:06:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:06:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:06:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:06:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=243) DEBUG 04-22 00:06:11 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=243) DEBUG 04-22 00:06:11 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=243) DEBUG 04-22 00:06:11 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=243) INFO 04-22 00:06:11 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... -(EngineCore pid=243) DEBUG 04-22 00:06:11 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.float16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=243) INFO 04-22 00:06:11 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=243) INFO 04-22 00:06:11 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=243) DEBUG 04-22 00:06:11 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=243) DEBUG 04-22 00:06:11 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=243) DEBUG 04-22 00:06:11 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=243) DEBUG 04-22 00:06:11 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=243) DEBUG 04-22 00:06:12 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors']] -(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(APIServer pid=1) DEBUG 04-22 00:06:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=243) INFO 04-22 00:06:20 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/739e065bf0/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=fd6c2a97a9 comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/739e065bf0/rank_0_0/backbone -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 00:06:20 
[compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
-(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=243) DEBUG 04-22 
00:06:20 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=243) 
DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=243) DEBUG 04-22 00:06:20 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=243) DEBUG 04-22 00:06:20 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=243) DEBUG 04-22 
00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore 
pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore 
pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/backends.py:1074] Vllm config hash: fd6c2a97a9 -(EngineCore pid=243) INFO 04-22 00:06:20 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.22 s -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=243) DEBUG 04-22 00:06:20 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=243) DEBUG 04-22 00:06:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:06:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 00:06:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=243) DEBUG 04-22 00:06:21 [compilation/.../utility/fix_functionalization.py:203] 
De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:06:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) INFO 04-22 00:06:23 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=243) DEBUG 04-22 00:06:23 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/739e065bf0/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=243) DEBUG 04-22 00:06:23 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:06:23 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=243) DEBUG 04-22 00:06:23 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.2 ms -(EngineCore pid=243) DEBUG 04-22 00:06:23 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:06:23 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) DEBUG 04-22 00:06:24 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/739e065bf0/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=243) DEBUG 04-22 00:06:25 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:06:25 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(EngineCore pid=243) DEBUG 04-22 00:06:25 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=243) 
DEBUG 04-22 00:06:25 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:06:25 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 00:06:26 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/739e065bf0/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') -(EngineCore pid=243) INFO 04-22 00:06:26 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.04 s -(EngineCore pid=243) DEBUG 04-22 00:06:26 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/739e065bf0/rank_0_0/backbone/computation_graph.py -(EngineCore pid=243) INFO 04-22 00:06:27 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/8a96897ca6e1860a8e6b9000115979b0b0ec581428214eb12ff34604a061c971/rank_0_0/model -(EngineCore pid=243) INFO 04-22 00:06:27 [compilation/monitor.py:48] torch.compile took 10.68 s in total -(EngineCore pid=243) INFO 04-22 00:06:27 [compilation/monitor.py:76] Initial profiling/warmup run took 0.48 s -(APIServer pid=1) DEBUG 04-22 00:06:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) INFO 04-22 00:06:33 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=243) INFO 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:06:33 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:06:33 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 138.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:06:33 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
-(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:06:33 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:06:33 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 260.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 00:06:34 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=243) INFO 04-22 00:06:34 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total -(EngineCore pid=243) DEBUG 04-22 00:06:34 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=243) DEBUG 04-22 00:06:34 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.24 GiB (total), 58.79 GiB (within requested) -(EngineCore pid=243) DEBUG 04-22 00:06:34 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.70 seconds. Total non KV cache memory: 17.12GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 14.99GiB. -(EngineCore pid=243) INFO 04-22 00:06:34 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.11 GiB -(EngineCore pid=243) INFO 04-22 00:06:34 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. 
-(EngineCore pid=243) INFO 04-22 00:06:34 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 476,016 tokens -(EngineCore pid=243) INFO 04-22 00:06:34 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 58.11x -(EngineCore pid=243) 2026-04-22 00:06:34,298 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=243) DEBUG 04-22 00:06:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) 2026-04-22 00:06:34,307 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:08:14 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:08:14 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:08:14 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:08:14 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:08:14 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:08:14 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct -(APIServer pid=1) INFO 04-22 00:08:14 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:08:14 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:08:14 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'dtype': 'float32', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:08:14 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:08:15 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:08:15 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0031874 secs -(APIServer pid=1) INFO 04-22 00:08:15 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 00:08:15 [config/model.py:2010] Upcasting torch.bfloat16 to torch.float32. -(APIServer pid=1) INFO 04-22 00:08:15 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:08:15 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:08:15 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:08:15 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:08:15 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 00:08:15 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 00:08:15 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:08:15 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:08:15 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:08:15 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:08:15 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:08:15 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:08:19 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:08:19 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:08:19 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:08:19 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:08:19 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:08:19 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:08:19 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:08:19 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:08:19 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:08:19 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:08:19 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:08:19 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-22 00:08:24 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=244) DEBUG 04-22 00:08:25 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:08:25 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=244) DEBUG 04-22 00:08:25 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/1fb5a54e-3c4f-4104-a23e-0494dac61c29'], outputs=['ipc:///tmp/208aeb77-47c0-41a4-960c-7a979382f661'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=244) DEBUG 04-22 00:08:25 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=244) DEBUG 04-22 00:08:25 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=244) DEBUG 04-22 00:08:25 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=244) DEBUG 04-22 00:08:25 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=244) DEBUG 04-22 00:08:25 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=244) INFO 04-22 00:08:25 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float32, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=244) DEBUG 04-22 00:08:26 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.23:60703 backend=nccl -(EngineCore pid=244) INFO 04-22 00:08:26 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.23:60703 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) DEBUG 04-22 00:08:26 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) INFO 04-22 00:08:26 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=244) DEBUG 04-22 00:08:26 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776816506.7239153, auto_measure=True -(EngineCore pid=244) DEBUG 04-22 00:08:26 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=244) DEBUG 04-22 00:08:26 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:08:26 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:08:26 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:08:26 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=244) DEBUG 04-22 00:08:26 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=244) DEBUG 04-22 00:08:26 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=244) DEBUG 04-22 00:08:26 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=244) INFO 04-22 00:08:26 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... -(EngineCore pid=244) DEBUG 04-22 00:08:27 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.float32, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASH_ATTN: [dtype not supported], FLASHINFER: [dtype not supported]}. -(EngineCore pid=244) INFO 04-22 00:08:27 [platforms/cuda.py:334] Using TRITON_ATTN attention backend out of potential backends: ['TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=244) DEBUG 04-22 00:08:27 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=244) DEBUG 04-22 00:08:27 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=244) DEBUG 04-22 00:08:27 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=244) DEBUG 04-22 00:08:27 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(EngineCore pid=244) DEBUG 04-22 00:08:27 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00002-of-00004.safetensors']] -(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore 
pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=244) INFO 04-22 00:08:43 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/9bfd138913/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=78754e407e comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/9bfd138913/rank_0_0/backbone -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] Compile env 
factors (raw): -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': 
False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, 
-(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=244) 
DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 
[compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=244) DEBUG 04-22 00:08:43 
[compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 
[compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=244) DEBUG 04-22 
00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, 
-(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore 
pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 
[compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=244) DEBUG 04-22 00:08:43 [compilation/backends.py:1074] Vllm config hash: 78754e407e -(EngineCore pid=244) INFO 04-22 00:08:43 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.25 s -(EngineCore pid=244) DEBUG 04-22 00:08:44 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=244) DEBUG 04-22 00:08:44 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=244) /usr/local/lib/python3.12/dist-packages/torch/_inductor/compile_fx.py:321: UserWarning: TensorFloat32 tensor 
cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance. -(EngineCore pid=244) warnings.warn( -(EngineCore pid=244) DEBUG 04-22 00:08:44 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:08:44 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=244) DEBUG 04-22 00:08:44 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=244) DEBUG 04-22 00:08:44 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:08:44 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(APIServer pid=1) DEBUG 04-22 00:08:45 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=244) INFO 04-22 00:08:46 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=244) DEBUG 04-22 00:08:46 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/9bfd138913/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=244) DEBUG 04-22 00:08:46 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:08:46 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=244) DEBUG 04-22 00:08:46 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=244) DEBUG 04-22 00:08:46 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:08:46 
[compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=244) DEBUG 04-22 00:08:47 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/9bfd138913/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=244) DEBUG 04-22 00:08:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:08:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(EngineCore pid=244) DEBUG 04-22 00:08:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=244) DEBUG 04-22 00:08:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:08:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=244) DEBUG 04-22 00:08:49 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/9bfd138913/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') -(EngineCore pid=244) INFO 04-22 00:08:49 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.10 s -(EngineCore pid=244) DEBUG 04-22 00:08:49 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/9bfd138913/rank_0_0/backbone/computation_graph.py -(EngineCore pid=244) INFO 04-22 00:08:50 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/5785960f71019408b49f0614e6897e9613551919aefdda4bdab95f0e33283b91/rank_0_0/model -(EngineCore pid=244) INFO 04-22 00:08:50 
[compilation/monitor.py:48] torch.compile took 10.81 s in total -(EngineCore pid=244) INFO 04-22 00:08:53 [compilation/monitor.py:76] Initial profiling/warmup run took 2.57 s -(APIServer pid=1) DEBUG 04-22 00:08:55 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=244) INFO 04-22 00:08:58 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=244) DEBUG 04-22 00:08:58 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=244) INFO 04-22 00:08:58 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=244) DEBUG 04-22 00:08:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:08:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:08:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:08:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:08:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:08:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:08:59 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 224.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-22 00:08:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 
00:08:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:08:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:09:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:09:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:09:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:09:00 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 4.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-22 00:09:00 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=244) INFO 04-22 00:09:00 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.80 GiB total -(EngineCore pid=244) DEBUG 04-22 00:09:01 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=244) DEBUG 04-22 00:09:01 [v1/worker/gpu_worker.py:430] Free memory after profiling: 46.29 GiB (total), 42.84 GiB (within requested) -(EngineCore pid=244) DEBUG 04-22 00:09:01 [v1/worker/gpu_worker.py:435] Memory profiling takes 21.36 seconds. Total non KV cache memory: 32.43GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.25GiB; weights memory: 29.98GiB. 
-(EngineCore pid=244) INFO 04-22 00:09:01 [v1/worker/gpu_worker.py:436] Available KV cache memory: 42.8 GiB -(EngineCore pid=244) INFO 04-22 00:09:01 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9602 to maintain the same effective KV cache size. -(EngineCore pid=244) INFO 04-22 00:09:01 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 175,296 tokens -(EngineCore pid=244) INFO 04-22 00:09:01 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 21.40x -(EngineCore pid=244) 2026-04-22 00:09:01,121 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=244) DEBUG 04-22 00:09:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) 2026-04-22 00:09:01,135 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:44:53 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:44:53 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:44:53 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:44:53 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:44:53 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:44:53 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct -(APIServer pid=1) INFO 04-22 01:44:53 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:44:53 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:44:53 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 2048, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:44:53 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:44:54 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:44:54 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0025746 secs -(APIServer pid=1) INFO 04-22 01:44:54 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 01:44:54 [config/model.py:1678] Using max model len 2048 -(APIServer pid=1) DEBUG 04-22 01:44:54 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:44:54 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:44:54 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:44:54 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:44:54 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) INFO 04-22 01:44:54 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:44:54 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 01:44:54 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:44:54 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:44:54 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:44:54 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:44:58 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:44:58 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:44:58 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:44:58 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:44:58 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:44:58 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:44:58 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:44:58 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:44:58 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:44:58 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:44:58 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:44:58 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:45:03 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-(EngineCore pid=243) DEBUG 04-22 01:45:04 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:45:04 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=243) DEBUG 04-22 01:45:04 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/7440d862-18f2-4f83-93ee-efde0faecd74'], outputs=['ipc:///tmp/f5b0c178-cd3a-4ae6-875f-cf85ae54d43b'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 01:45:04 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 01:45:04 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 01:45:04 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 01:45:04 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 01:45:04 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=243) INFO 04-22 01:45:04 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) DEBUG 04-22 01:45:05 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.35:48997 backend=nccl -(EngineCore pid=243) INFO 04-22 01:45:05 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.35:48997 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) DEBUG 04-22 01:45:05 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) INFO 04-22 01:45:05 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=243) DEBUG 04-22 01:45:05 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822305.5706387, auto_measure=True -(EngineCore pid=243) DEBUG 04-22 01:45:05 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=243) DEBUG 04-22 01:45:05 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:45:05 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:45:05 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:45:05 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=243) DEBUG 04-22 01:45:05 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=243) DEBUG 04-22 01:45:05 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=243) DEBUG 04-22 01:45:05 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=243) INFO 04-22 01:45:05 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... -(EngineCore pid=243) DEBUG 04-22 01:45:06 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=243) INFO 04-22 01:45:06 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=243) INFO 04-22 01:45:06 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=243) DEBUG 04-22 01:45:06 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=243) DEBUG 04-22 01:45:06 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=243) DEBUG 04-22 01:45:06 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=243) DEBUG 04-22 01:45:06 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=243) DEBUG 04-22 01:45:06 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00002-of-00004.safetensors']] -(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=243) DEBUG 04-22 
01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=243) INFO 
04-22 01:45:23 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/42b409dd87/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=0252433631 comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/42b409dd87/rank_0_0/backbone -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, 
-(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 
[compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 
[compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, 
-(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, 
-(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore 
pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=243) 
DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
-(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/backends.py:1074] Vllm config hash: 0252433631 -(EngineCore pid=243) INFO 04-22 01:45:23 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.31 s -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=243) DEBUG 04-22 01:45:23 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=243) DEBUG 04-22 01:45:24 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:45:24 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=243) DEBUG 04-22 01:45:24 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=243) DEBUG 04-22 01:45:24 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:45:24 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(APIServer pid=1) DEBUG 04-22 01:45:24 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) INFO 04-22 01:45:26 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=243) DEBUG 04-22 01:45:26 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/42b409dd87/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=243) DEBUG 04-22 01:45:26 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:45:26 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=243) DEBUG 04-22 01:45:26 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=243) DEBUG 04-22 01:45:26 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:45:26 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) DEBUG 04-22 01:45:27 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/42b409dd87/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=243) DEBUG 04-22 01:45:28 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:45:28 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(EngineCore pid=243) DEBUG 04-22 01:45:28 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=243) DEBUG 04-22 01:45:28 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:45:28 
[compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 01:45:28 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/42b409dd87/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') -(EngineCore pid=243) INFO 04-22 01:45:28 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.21 s -(EngineCore pid=243) DEBUG 04-22 01:45:29 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/42b409dd87/rank_0_0/backbone/computation_graph.py -(EngineCore pid=243) INFO 04-22 01:45:30 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/6a79de39180cf098b701309e28d6e3149e650bda89dc5a00214c5ae0d896a5de/rank_0_0/model -(EngineCore pid=243) INFO 04-22 01:45:30 [compilation/monitor.py:48] torch.compile took 10.96 s in total -(EngineCore pid=243) INFO 04-22 01:45:30 [compilation/monitor.py:76] Initial profiling/warmup run took 0.45 s -(APIServer pid=1) DEBUG 04-22 01:45:34 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) INFO 04-22 01:45:36 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=243) INFO 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:45:36 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:45:36 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 138.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:45:36 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
-(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:45:36 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 260.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=243) INFO 04-22 01:45:36 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total -(EngineCore pid=243) DEBUG 04-22 01:45:37 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=243) DEBUG 04-22 01:45:37 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.26 GiB (total), 58.81 GiB (within requested) -(EngineCore pid=243) DEBUG 04-22 01:45:37 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.92 seconds. Total non KV cache memory: 17.12GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 14.99GiB. -(EngineCore pid=243) INFO 04-22 01:45:37 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.11 GiB -(EngineCore pid=243) INFO 04-22 01:45:37 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. 
-(EngineCore pid=243) INFO 04-22 01:45:37 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 476,016 tokens -(EngineCore pid=243) INFO 04-22 01:45:37 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 2,048 tokens per request: 232.43x -(EngineCore pid=243) 2026-04-22 01:45:37,202 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=243) DEBUG 04-22 01:45:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) 2026-04-22 01:45:37,212 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:46:04 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:46:04 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:46:04 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:46:04 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:46:04 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:46:04 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct -(APIServer pid=1) INFO 04-22 01:46:04 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:46:04 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:46:04 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 4096, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:46:04 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:46:04 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:46:04 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003769 secs -(APIServer pid=1) INFO 04-22 01:46:04 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 01:46:04 [config/model.py:1678] Using max model len 4096 -(APIServer pid=1) DEBUG 04-22 01:46:04 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:46:04 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:46:04 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:46:04 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:46:04 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) INFO 04-22 01:46:04 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:46:04 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 01:46:05 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:46:05 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:46:05 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:46:05 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:46:09 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:46:09 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:46:09 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:46:09 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:46:09 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:46:09 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:46:09 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:46:09 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:46:09 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:46:09 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:46:09 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:46:09 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:46:13 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-(EngineCore pid=243) DEBUG 04-22 01:46:15 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:46:15 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=243) DEBUG 04-22 01:46:15 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/779c182c-4761-4a4d-9b5c-a83785404ad2'], outputs=['ipc:///tmp/afe7f37a-1eb4-4819-a4e6-d191162b838c'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 01:46:15 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 01:46:15 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 01:46:15 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 01:46:15 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 01:46:15 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=243) INFO 04-22 01:46:15 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) DEBUG 04-22 01:46:15 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.36:39353 backend=nccl -(EngineCore pid=243) INFO 04-22 01:46:15 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.36:39353 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) DEBUG 04-22 01:46:15 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) INFO 04-22 01:46:15 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=243) DEBUG 04-22 01:46:16 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822376.346393, auto_measure=True -(EngineCore pid=243) DEBUG 04-22 01:46:16 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=243) DEBUG 04-22 01:46:16 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:46:16 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:46:16 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:46:16 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=243) DEBUG 04-22 01:46:16 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=243) DEBUG 04-22 01:46:16 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=243) DEBUG 04-22 01:46:16 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=243) INFO 04-22 01:46:16 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... -(EngineCore pid=243) DEBUG 04-22 01:46:17 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=243) INFO 04-22 01:46:17 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=243) INFO 04-22 01:46:17 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=243) DEBUG 04-22 01:46:17 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=243) DEBUG 04-22 01:46:17 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=243) DEBUG 04-22 01:46:17 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=243) DEBUG 04-22 01:46:17 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=243) DEBUG 04-22 01:46:17 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00004-of-00004.safetensors']] -(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(APIServer pid=1) DEBUG 04-22 01:46:25 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=243) INFO 04-22 01:46:29 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/a3ffc11007/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=1b824e05d5 comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/a3ffc11007/rank_0_0/backbone -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 01:46:29 
[compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
-(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=243) DEBUG 04-22 
01:46:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=243) 
DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=243) DEBUG 04-22 01:46:29 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=243) DEBUG 04-22 01:46:29 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=243) DEBUG 04-22 
01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore 
pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore 
pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/backends.py:1074] Vllm config hash: 1b824e05d5 -(EngineCore pid=243) INFO 04-22 01:46:29 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.35 s -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/.../utility/fix_functionalization.py:203] 
De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:46:29 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) INFO 04-22 01:46:31 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=243) DEBUG 04-22 01:46:31 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/a3ffc11007/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=243) DEBUG 04-22 01:46:31 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:46:31 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=243) DEBUG 04-22 01:46:31 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=243) DEBUG 04-22 01:46:31 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:46:31 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) DEBUG 04-22 01:46:33 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/a3ffc11007/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=243) DEBUG 04-22 01:46:34 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:46:34 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(EngineCore pid=243) DEBUG 04-22 01:46:34 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=243) 
DEBUG 04-22 01:46:34 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:46:34 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 01:46:34 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/a3ffc11007/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') -(EngineCore pid=243) INFO 04-22 01:46:34 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.19 s -(EngineCore pid=243) DEBUG 04-22 01:46:34 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/a3ffc11007/rank_0_0/backbone/computation_graph.py -(APIServer pid=1) DEBUG 04-22 01:46:35 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) INFO 04-22 01:46:35 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/2dde39181b04ca928d91381047e0719bd31f31ec1f42285a0f371726d4389ad4/rank_0_0/model -(EngineCore pid=243) INFO 04-22 01:46:35 [compilation/monitor.py:48] torch.compile took 10.97 s in total -(EngineCore pid=243) INFO 04-22 01:46:36 [compilation/monitor.py:76] Initial profiling/warmup run took 0.45 s -(EngineCore pid=243) INFO 04-22 01:46:41 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=243) INFO 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:46:41 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:46:41 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 138.00 MiB first-capture + (51-1) × 6.00 MiB per-graph 
-(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:46:41 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:46:41 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:46:41 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 260.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 01:46:42 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=243) INFO 04-22 01:46:42 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total -(EngineCore pid=243) DEBUG 04-22 01:46:42 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=243) DEBUG 04-22 01:46:42 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.26 GiB (total), 58.81 GiB (within requested) -(EngineCore pid=243) DEBUG 04-22 01:46:42 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.89 seconds. Total non KV cache memory: 17.12GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 14.99GiB. 
-(EngineCore pid=243) INFO 04-22 01:46:42 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.11 GiB -(EngineCore pid=243) INFO 04-22 01:46:42 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. -(EngineCore pid=243) INFO 04-22 01:46:42 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 476,016 tokens -(EngineCore pid=243) INFO 04-22 01:46:42 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 4,096 tokens per request: 116.21x -(EngineCore pid=243) 2026-04-22 01:46:42,654 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=243) DEBUG 04-22 01:46:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) 2026-04-22 01:46:42,663 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:03:35 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:03:35 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:03:35 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:03:35 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:03:35 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:03:35 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct -(APIServer pid=1) INFO 04-22 00:03:35 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:03:35 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:03:35 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:03:35 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:03:36 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:03:36 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0012799 secs -(APIServer pid=1) INFO 04-22 00:03:36 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 00:03:36 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:03:36 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:03:36 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:03:36 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:03:36 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:03:36 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) INFO 04-22 00:03:36 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:03:36 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:03:36 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:03:36 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:03:36 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:03:36 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:03:40 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:03:40 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:03:40 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:03:40 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:03:40 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:03:40 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:03:40 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:03:40 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:03:40 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:03:40 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:03:40 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:03:40 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:03:45 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-(EngineCore pid=242) DEBUG 04-22 00:03:46 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:03:46 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=242) DEBUG 04-22 00:03:46 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/a6ae8f34-d228-453e-891a-3092bd7b0a4e'], outputs=['ipc:///tmp/af9ad280-429e-48b4-bc4c-a51022de6971'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=242) DEBUG 04-22 00:03:46 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=242) DEBUG 04-22 00:03:46 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=242) DEBUG 04-22 00:03:46 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=242) DEBUG 04-22 00:03:46 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=242) DEBUG 04-22 00:03:46 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=242) INFO 04-22 00:03:46 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=242) DEBUG 04-22 00:03:47 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.185:60191 backend=nccl -(EngineCore pid=242) INFO 04-22 00:03:47 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.185:60191 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=242) DEBUG 04-22 00:03:47 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=242) INFO 04-22 00:03:47 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=242) DEBUG 04-22 00:03:47 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776816227.5468106, auto_measure=True -(EngineCore pid=242) DEBUG 04-22 00:03:47 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=242) DEBUG 04-22 00:03:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=242) DEBUG 04-22 00:03:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=242) DEBUG 04-22 00:03:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=242) DEBUG 04-22 00:03:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=242) DEBUG 04-22 00:03:47 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=242) DEBUG 04-22 00:03:47 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=242) DEBUG 04-22 00:03:47 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=242) INFO 04-22 00:03:47 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... -(EngineCore pid=242) DEBUG 04-22 00:03:48 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=242) INFO 04-22 00:03:48 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=242) INFO 04-22 00:03:48 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=242) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=242) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=242) DEBUG 04-22 00:03:48 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=242) DEBUG 04-22 00:03:48 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=242) DEBUG 04-22 00:03:48 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=242) DEBUG 04-22 00:03:48 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=242) DEBUG 04-22 00:03:48 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00004-of-00004.safetensors']] -(EngineCore pid=242) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(APIServer pid=1) DEBUG 04-22 00:04:06 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=242) INFO 04-22 00:04:07 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/b9570f3049/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=5e2afef975 comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/b9570f3049/rank_0_0/backbone -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=242) DEBUG 04-22 00:04:07 
[compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
-(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=242) DEBUG 04-22 
00:04:07 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=242) 
DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=242) DEBUG 04-22 00:04:07 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=242) DEBUG 04-22 00:04:07 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=242) DEBUG 04-22 
00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore 
pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore 
pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/backends.py:1074] Vllm config hash: 5e2afef975 -(EngineCore pid=242) INFO 04-22 00:04:07 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.29 s -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=242) DEBUG 04-22 00:04:07 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=242) DEBUG 04-22 00:04:08 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=242) DEBUG 04-22 00:04:08 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=242) DEBUG 04-22 00:04:08 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.3 ms -(EngineCore pid=242) DEBUG 04-22 00:04:08 [compilation/.../utility/fix_functionalization.py:203] 
De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=242) DEBUG 04-22 00:04:08 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=242) INFO 04-22 00:04:09 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=242) DEBUG 04-22 00:04:09 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/b9570f3049/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=242) DEBUG 04-22 00:04:10 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=242) DEBUG 04-22 00:04:10 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=242) DEBUG 04-22 00:04:10 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=242) DEBUG 04-22 00:04:10 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=242) DEBUG 04-22 00:04:10 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=242) DEBUG 04-22 00:04:11 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/b9570f3049/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=242) DEBUG 04-22 00:04:12 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=242) DEBUG 04-22 00:04:12 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(EngineCore pid=242) DEBUG 04-22 00:04:12 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=242) 
DEBUG 04-22 00:04:12 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=242) DEBUG 04-22 00:04:12 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=242) DEBUG 04-22 00:04:12 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/b9570f3049/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') -(EngineCore pid=242) INFO 04-22 00:04:12 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.13 s -(EngineCore pid=242) DEBUG 04-22 00:04:12 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/b9570f3049/rank_0_0/backbone/computation_graph.py -(EngineCore pid=242) INFO 04-22 00:04:13 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/346f76857706c0537d5aef76aa2af17185a22023d838afba9a1d18619c755486/rank_0_0/model -(EngineCore pid=242) INFO 04-22 00:04:13 [compilation/monitor.py:48] torch.compile took 10.84 s in total -(EngineCore pid=242) INFO 04-22 00:04:14 [compilation/monitor.py:76] Initial profiling/warmup run took 0.46 s -(APIServer pid=1) DEBUG 04-22 00:04:16 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=242) INFO 04-22 00:04:19 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=242) DEBUG 04-22 00:04:19 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=242) INFO 04-22 00:04:19 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 00:04:20 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 00:04:20 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 138.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 00:04:20 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
-(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 00:04:20 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 260.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=242) INFO 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total -(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.24 GiB (total), 58.79 GiB (within requested) -(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.96 seconds. Total non KV cache memory: 17.12GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 14.99GiB. -(EngineCore pid=242) INFO 04-22 00:04:20 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.11 GiB -(EngineCore pid=242) INFO 04-22 00:04:20 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. 
-(EngineCore pid=242) INFO 04-22 00:04:20 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 476,016 tokens -(EngineCore pid=242) INFO 04-22 00:04:20 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 58.11x -(EngineCore pid=242) 2026-04-22 00:04:20,974 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=242) DEBUG 04-22 00:04:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) 2026-04-22 00:04:20,983 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=242) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:41:28 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:41:28 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -INFO 04-22 01:41:28 [entrypoints/cli/serve.py:101] Defaulting api_server_count to data_parallel_size (2). 
-DEBUG 04-22 01:41:28 [v1/metrics/prometheus.py:27] Created PROMETHEUS_MULTIPROC_DIR at /tmp/tmphdhy4way -INFO 04-22 01:41:28 [entrypoints/utils.py:299] -INFO 04-22 01:41:28 [entrypoints/utils.py:299] █ █ █▄ ▄█ -INFO 04-22 01:41:28 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -INFO 04-22 01:41:28 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct -INFO 04-22 01:41:28 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -INFO 04-22 01:41:28 [entrypoints/utils.py:299] -INFO 04-22 01:41:28 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'api_server_count': 2, 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 8192, 'data_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -WARNING 04-22 01:41:28 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -DEBUG 04-22 01:41:28 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -DEBUG 04-22 01:41:28 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0013756 secs -INFO 04-22 01:41:28 [config/model.py:549] Resolved architecture: LlamaForCausalLM -INFO 04-22 01:41:28 [config/model.py:1678] Using max model len 8192 -DEBUG 04-22 01:41:28 [config/model.py:1743] Generative models support chunked prefill. -DEBUG 04-22 01:41:28 [config/model.py:1801] Generative models support prefix caching. -DEBUG 04-22 01:41:28 [engine/arg_utils.py:2116] Enabling chunked prefill by default -DEBUG 04-22 01:41:28 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -DEBUG 04-22 01:41:28 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-DEBUG 04-22 01:41:28 [config/parallel.py:743] Defaulting to use mp for distributed inference -INFO 04-22 01:41:28 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -INFO 04-22 01:41:28 [config/vllm.py:790] Asynchronous scheduling is enabled. -DEBUG 04-22 01:41:32 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:41:32 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:41:32 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:41:32 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:41:32 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:41:32 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:41:32 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:41:32 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:41:32 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:41:32 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:41:32 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:41:32 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:41:37 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -INFO 04-22 01:41:38 [v1/engine/utils.py:914] Started DP Coordinator process (PID: 239) -INFO 04-22 01:41:38 [v1/utils.py:223] Started 2 API server processes -DEBUG 04-22 01:41:41 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:41:41 [platforms/__init__.py:37] Checking if TPU platform is available. 
-DEBUG 04-22 01:41:41 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:41:41 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:41:41 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:41:41 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:41:41 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:41:41 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:41:41 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:41:41 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:41:41 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:41:41 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:41:41 [platforms/__init__.py:37] Checking if TPU platform is available. 
-DEBUG 04-22 01:41:41 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:41:41 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:41:41 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:41:41 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:41:41 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:41:41 [platforms/__init__.py:113] Checking if ROCm platform is available. 
-DEBUG 04-22 01:41:41 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:41:41 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:41:41 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:41:46 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:41:46 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:41:46 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:41:46 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [v1/engine/core.py:1018] Waiting for init message from front-end. -DEBUG 04-22 01:41:48 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/79346ad6-6e10-42b0-95ff-725d8af58318', 'ipc:///tmp/76f7bc9d-9737-4b5a-849d-e58f7f34ac76'], outputs=['ipc:///tmp/739e9588-9318-470d-8b78-7c2a18de5ffa', 'ipc:///tmp/5b6feb5c-2bf1-4aa0-be99-632876126031'], coordinator_input='ipc:///tmp/45db5870-0521-4512-b1ea-dbb2fb7083e8', coordinator_output='ipc:///tmp/4b686f6c-d679-4419-bc33-a3231d0b59e6', frontend_stats_publish_address='ipc:///tmp/f21066c4-2444-497d-ae13-c1dd93d6f752'), parallel_config={}) -(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [v1/engine/core.py:826] Has DP Coordinator: True, stats publish address: ipc:///tmp/f21066c4-2444-497d-ae13-c1dd93d6f752 -(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore_DP0 pid=437) INFO 04-22 01:41:48 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore_DP0 pid=437) WARNING 04-22 01:41:48 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore_DP0 pid=437) INFO 04-22 01:41:48 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.129.9.34 (local), world_size=1, local_world_size=1 -(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/65e715b1-9a8f-41c5-9ec9-075a5689f469 -(EngineCore_DP0 pid=437) DEBUG 04-22 01:41:48 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 16777216, 10, 'psm_3dd4e55a'), local_subscribe_addr='ipc:///tmp/65e715b1-9a8f-41c5-9ec9-075a5689f469', local_notify_addr='ipc:///tmp/ac8cd9fe-f882-4792-b609-73f8cc090691', remote_subscribe_addr=None, remote_addr_ipv6=False) -(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [v1/engine/core.py:1018] Waiting for init message from front-end. -DEBUG 04-22 01:41:48 [v1/engine/utils.py:1158] HELLO from local core engine process 1. 
-(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/79346ad6-6e10-42b0-95ff-725d8af58318', 'ipc:///tmp/76f7bc9d-9737-4b5a-849d-e58f7f34ac76'], outputs=['ipc:///tmp/739e9588-9318-470d-8b78-7c2a18de5ffa', 'ipc:///tmp/5b6feb5c-2bf1-4aa0-be99-632876126031'], coordinator_input='ipc:///tmp/45db5870-0521-4512-b1ea-dbb2fb7083e8', coordinator_output='ipc:///tmp/4b686f6c-d679-4419-bc33-a3231d0b59e6', frontend_stats_publish_address='ipc:///tmp/f21066c4-2444-497d-ae13-c1dd93d6f752'), parallel_config={}) -(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [v1/engine/core.py:826] Has DP Coordinator: True, stats publish address: ipc:///tmp/f21066c4-2444-497d-ae13-c1dd93d6f752 -(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(EngineCore_DP1 pid=438) WARNING 04-22 01:41:48 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore_DP1 pid=438) INFO 04-22 01:41:48 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.129.9.34 (local), world_size=1, local_world_size=1 -(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/f5efeb96-b1c1-4784-a3f8-e05ffaf8af6c -(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:48 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0], buffer_handle=(1, 16777216, 10, 'psm_b418ed59'), local_subscribe_addr='ipc:///tmp/f5efeb96-b1c1-4784-a3f8-e05ffaf8af6c', local_notify_addr='ipc:///tmp/742ab49c-d494-48b1-b8cd-3d76bc9e2181', remote_subscribe_addr=None, remote_addr_ipv6=False) -(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(ApiServer_1 pid=440) WARNING 04-22 01:41:48 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(ApiServer_0 pid=439) WARNING 04-22 01:41:48 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0005344 secs -(ApiServer_0 pid=439) INFO 04-22 01:41:48 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(ApiServer_0 pid=439) INFO 04-22 01:41:48 [config/model.py:1678] Using max model len 8192 -(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [config/model.py:1743] Generative models support chunked prefill. -(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [config/model.py:1801] Generative models support prefix caching. -(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [config/parallel.py:743] Defaulting to use mp for distributed inference -(ApiServer_0 pid=439) INFO 04-22 01:41:48 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(ApiServer_0 pid=439) INFO 04-22 01:41:48 [config/vllm.py:790] Asynchronous scheduling is enabled. -(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0004159 secs -(ApiServer_1 pid=440) INFO 04-22 01:41:48 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(ApiServer_1 pid=440) INFO 04-22 01:41:48 [config/model.py:1678] Using max model len 8192 -(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [config/model.py:1743] Generative models support chunked prefill. -(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [config/model.py:1801] Generative models support prefix caching. -(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(ApiServer_1 pid=440) DEBUG 04-22 01:41:48 [config/parallel.py:743] Defaulting to use mp for distributed inference -(ApiServer_1 pid=440) INFO 04-22 01:41:48 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(ApiServer_1 pid=440) INFO 04-22 01:41:48 [config/vllm.py:790] Asynchronous scheduling is enabled. -(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. 
-(ApiServer_0 pid=439) DEBUG 04-22 01:41:48 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(ApiServer_1 pid=440) DEBUG 04-22 01:41:49 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(ApiServer_1 pid=440) DEBUG 04-22 01:41:49 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(ApiServer_0 pid=439) DEBUG 04-22 01:41:49 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(ApiServer_0 pid=439) DEBUG 04-22 01:41:49 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -(ApiServer_1 pid=440) DEBUG 04-22 01:41:49 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(ApiServer_1 pid=440) DEBUG 04-22 01:41:49 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:41:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:41:51 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:41:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:41:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:41:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:41:51 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:41:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:41:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:41:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:41:51 [platforms/__init__.py:113] Checking if ROCm platform is available. 
-DEBUG 04-22 01:41:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:41:51 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:41:51 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:41:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:41:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:41:51 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:41:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:41:51 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:41:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:41:51 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:41:51 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:41:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:41:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:41:51 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:41:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:41:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-22 01:41:57 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:41:57 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:41:57 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:41:57 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 01:41:57 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:41:57 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:41:57 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:41:57 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 01:41:58 [v1/engine/utils.py:1047] Waiting for 2 local, 0 remote core engine proc(s) to start. -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. 
-(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 605, in __init__ -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] self.worker.init_device() -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 312, in init_device -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] self.worker.init_device() # type: ignore -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] 
^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 242, in init_device -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] assert self.local_rank < torch.accelerator.device_count(), ( -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=1224) ERROR 04-22 01:41:58 [v1/executor/multiproc_executor.py:857] AssertionError: DP adjusted local rank 1 is out of bounds. -(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:58 [v1/executor/multiproc_executor.py:419] Worker Termination: allow workers to gracefully shutdown -(Worker pid=1219) DEBUG 04-22 01:41:58 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:33047 backend=nccl -(Worker pid=1219) INFO 04-22 01:41:58 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:33047 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(Worker pid=1219) DEBUG 04-22 01:41:58 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(Worker pid=1219) INFO 04-22 01:41:58 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(Worker pid=1219) DEBUG 04-22 01:41:58 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822118.8622458, auto_measure=True -(Worker pid=1219) DEBUG 04-22 01:41:58 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=1219) DEBUG 04-22 01:41:58 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=1219) DEBUG 04-22 01:41:58 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=1219) DEBUG 04-22 01:41:58 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=1219) DEBUG 04-22 01:41:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=1219) DEBUG 04-22 01:41:59 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=1219) DEBUG 04-22 01:41:59 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=1219) DEBUG 04-22 01:41:59 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). 
-(Worker pid=1219) INFO 04-22 01:41:59 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] EngineCore failed to start. -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] Traceback (most recent call last): -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] return func(*args, **kwargs) -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^ -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__ -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] super().__init__( -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 114, in __init__ -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] self.model_executor = executor_class(vllm_config) -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 101, in __init__ -(EngineCore_DP1 pid=438) ERROR 
04-22 01:41:59 [v1/engine/core.py:1108] super().__init__(vllm_config) -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] return func(*args, **kwargs) -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^ -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 103, in __init__ -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] self._init_executor() -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 190, in _init_executor -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] self.workers = WorkerProc.wait_for_ready(unready_workers) -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 736, in wait_for_ready -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] raise e from None -(EngineCore_DP1 pid=438) ERROR 04-22 01:41:59 [v1/engine/core.py:1108] Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause. 
-(EngineCore_DP1 pid=438) Process EngineCore_DP1: -(EngineCore_DP1 pid=438) Traceback (most recent call last): -(EngineCore_DP1 pid=438) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap -(EngineCore_DP1 pid=438) self.run() -(EngineCore_DP1 pid=438) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run -(EngineCore_DP1 pid=438) self._target(*self._args, **self._kwargs) -(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1112, in run_engine_core -(EngineCore_DP1 pid=438) raise e -(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core -(EngineCore_DP1 pid=438) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) -(EngineCore_DP1 pid=438) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(EngineCore_DP1 pid=438) return func(*args, **kwargs) -(EngineCore_DP1 pid=438) ^^^^^^^^^^^^^^^^^^^^^ -(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__ -(EngineCore_DP1 pid=438) super().__init__( -(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 114, in __init__ -(EngineCore_DP1 pid=438) self.model_executor = executor_class(vllm_config) -(EngineCore_DP1 pid=438) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 101, in __init__ -(EngineCore_DP1 pid=438) super().__init__(vllm_config) -(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(EngineCore_DP1 pid=438) return func(*args, **kwargs) -(EngineCore_DP1 pid=438) ^^^^^^^^^^^^^^^^^^^^^ -(EngineCore_DP1 pid=438) File 
"/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 103, in __init__ -(EngineCore_DP1 pid=438) self._init_executor() -(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 190, in _init_executor -(EngineCore_DP1 pid=438) self.workers = WorkerProc.wait_for_ready(unready_workers) -(EngineCore_DP1 pid=438) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore_DP1 pid=438) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 736, in wait_for_ready -(EngineCore_DP1 pid=438) raise e from None -(EngineCore_DP1 pid=438) Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause. -(EngineCore_DP1 pid=438) DEBUG 04-22 01:41:59 [v1/executor/multiproc_executor.py:438] Triggering shutdown of workers -(Worker pid=1219) DEBUG 04-22 01:41:59 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker pid=1219) INFO 04-22 01:41:59 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(Worker pid=1219) INFO 04-22 01:41:59 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker pid=1219) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=1219) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(Worker pid=1219) DEBUG 04-22 01:41:59 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker pid=1219) DEBUG 04-22 01:41:59 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker pid=1219) DEBUG 04-22 01:41:59 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker pid=1219) DEBUG 04-22 01:41:59 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -Traceback (most recent call last): - File "/usr/local/bin/vllm", line 10, in - sys.exit(main()) - ^^^^^^ - File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main - args.dispatch_function(args) - File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 118, in cmd - run_multi_api_server(args) - File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 286, in run_multi_api_server - with launch_core_engines( - ^^^^^^^^^^^^^^^^^^^^ - File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ - next(self.gen) - File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines - wait_for_engine_startup( - File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup - raise RuntimeError( -RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} -(Worker pid=1219) DEBUG 04-22 01:42:00 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors']] -(Worker pid=1219) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:42:32 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:42:32 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 01:42:32 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:42:32 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:42:32 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:42:32 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct -(APIServer pid=1) INFO 04-22 01:42:32 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:42:32 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:42:32 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 8192, 'pipeline_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:42:32 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:42:33 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:42:33 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003599 secs -(APIServer pid=1) INFO 04-22 01:42:33 
[config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 01:42:33 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 01:42:33 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:42:33 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:42:33 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:42:33 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:42:33 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:42:33 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 01:42:33 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:42:33 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 01:42:33 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:42:33 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:42:33 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:42:33 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:42:37 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:42:37 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:42:37 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:42:37 [platforms/__init__.py:62] Checking if CUDA platform is available. 
-DEBUG 04-22 01:42:37 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:42:37 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:42:37 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:42:37 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:42:37 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:42:37 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:42:37 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:42:37 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:42:42 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=243) DEBUG 04-22 01:42:43 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:42:43 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=243) DEBUG 04-22 01:42:43 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/6b7dad8b-6920-4893-bea2-725a72500b35'], outputs=['ipc:///tmp/dbf6128c-0c73-4293-9574-415241db12e2'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 01:42:43 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 01:42:43 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 01:42:43 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 01:42:43 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 01:42:43 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=243) INFO 04-22 01:42:43 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=2, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) WARNING 04-22 01:42:43 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=243) INFO 04-22 01:42:43 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.3.125 (local), world_size=2, local_world_size=2 -(EngineCore pid=243) DEBUG 04-22 01:42:43 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/4ada89ba-6925-4164-89e6-3295f56a409c -(EngineCore pid=243) DEBUG 04-22 01:42:43 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_c5cfa3c7'), local_subscribe_addr='ipc:///tmp/4ada89ba-6925-4164-89e6-3295f56a409c', local_notify_addr='ipc:///tmp/34fbc80b-d2fa-4999-8433-0eeed4ebbc91', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 01:42:47 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:42:47 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:42:47 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:42:47 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:42:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:42:47 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:42:47 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:42:47 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:42:47 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:42:47 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:42:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:42:47 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-22 01:42:47 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:42:47 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:42:47 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:42:47 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:42:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:42:47 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:42:47 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:42:47 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:42:47 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:42:47 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:42:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:42:47 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:42:51 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:42:52 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:42:53 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:42:53 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:42:53 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:42:53 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) DEBUG 04-22 01:42:53 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -DEBUG 04-22 01:42:53 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:42:53 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:42:53 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:42:53 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(Worker pid=442) DEBUG 04-22 01:42:54 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:55875 backend=nccl -(Worker pid=442) INFO 04-22 01:42:54 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:55875 backend=nccl -(Worker pid=443) DEBUG 04-22 01:42:54 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:55875 backend=nccl -(Worker pid=443) INFO 04-22 01:42:54 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:55875 backend=nccl -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=442) DEBUG 04-22 01:42:54 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=443) DEBUG 04-22 01:42:54 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=442) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=442) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=442) DEBUG 04-22 01:42:54 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=442) INFO 04-22 01:42:54 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(Worker pid=442) INFO 04-22 01:42:55 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(Worker pid=442) DEBUG 04-22 01:42:55 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.76GiB, total_memory=79.19GiB, cuda_memory=1.43GiB, torch_memory=0.0GiB, non_torch_memory=1.43GiB, timestamp=1776822175.437332, auto_measure=True -(Worker pid=442) DEBUG 04-22 01:42:55 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=443) DEBUG 04-22 01:42:55 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.76GiB, total_memory=79.19GiB, cuda_memory=1.43GiB, torch_memory=0.0GiB, non_torch_memory=1.43GiB, timestamp=1776822175.4582114, auto_measure=True -(Worker pid=443) DEBUG 04-22 01:42:55 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=442) DEBUG 04-22 01:42:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=442) DEBUG 04-22 01:42:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=442) DEBUG 04-22 01:42:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 01:42:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 01:42:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=443) DEBUG 04-22 01:42:55 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=442) DEBUG 04-22 01:42:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=442) DEBUG 04-22 01:42:55 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=442) DEBUG 04-22 01:42:55 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=443) DEBUG 04-22 01:42:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=443) DEBUG 04-22 01:42:55 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=442) DEBUG 04-22 01:42:55 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_PP0 pid=442) INFO 04-22 01:42:55 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... -(Worker_PP0 pid=442) DEBUG 04-22 01:42:55 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_PP0 pid=442) INFO 04-22 01:42:55 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(Worker_PP0 pid=442) INFO 04-22 01:42:55 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_PP1 pid=443) DEBUG 04-22 01:42:56 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_PP1 pid=443) DEBUG 04-22 01:42:56 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_PP1 pid=443) DEBUG 04-22 01:42:56 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 33, 'silu_and_mul': 16, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_PP1 pid=443) DEBUG 04-22 01:42:56 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_PP0 pid=442) DEBUG 04-22 01:42:56 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_PP0 pid=442) DEBUG 04-22 01:42:56 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_PP0 pid=442) DEBUG 04-22 01:42:56 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 32, 'silu_and_mul': 16, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1}) -(Worker_PP0 pid=442) DEBUG 04-22 01:42:56 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_PP1 pid=443) DEBUG 04-22 01:42:56 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors']] -(Worker_PP0 pid=442) DEBUG 04-22 01:42:56 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00002-of-00004.safetensors']] -(Worker_PP0 pid=442) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(Worker_PP1 pid=443) DEBUG 04-22 01:43:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:00 [compilation/decorators.py:528] Start compiling function -(EngineCore pid=243) DEBUG 04-22 01:43:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py 
-(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/sequence.py -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 
cfg=f2b27d324e comp=e546579c48 code=7f2f48347ccdde3394423985176b64bbf478231b0e1845a35438330041d319b6 dir=/data/.cache/vllm/torch_compile_cache/639e59bee9/rank_1_0/backbone -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VERBOSE': 
False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 
'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, 
-(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_PP1 
pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_PP1 pid=443) 
DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_PP1 pid=443) DEBUG 04-22 
01:43:02 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_PP1 pid=443) 
DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 
256, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, 
-(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 
'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_PP1 pid=443) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] Vllm config hash: f2b27d324e -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py 
-(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_PP0 
pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/sequence.py -(Worker_PP0 pid=442) INFO 04-22 01:43:02 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/6749c2ea5a/rank_0_0/backbone for vLLM's torch.compile -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=fc2500db0a comp=e546579c48 code=7f24f63a7548e6fdf64e57e8c7d52119b84b1d43a0b72f3e4a2b71446193f817 dir=/data/.cache/vllm/torch_compile_cache/6749c2ea5a/rank_0_0/backbone -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, 
-(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 
[compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 
[compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 
[compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', 
-(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 
1, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 
[compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 
[compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, 
-(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_PP0 pid=442) DEBUG 04-22 01:43:02 [compilation/backends.py:1074] Vllm config hash: fc2500db0a -(Worker_PP0 pid=442) INFO 04-22 01:43:02 [compilation/backends.py:1111] Dynamo bytecode transform time: 2.48 s -(Worker_PP0 pid=442) DEBUG 04-22 01:43:03 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(Worker_PP0 pid=442) DEBUG 04-22 01:43:03 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(APIServer pid=1) DEBUG 04-22 01:43:03 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_PP1 pid=443) DEBUG 04-22 01:43:03 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP1 pid=443) DEBUG 04-22 01:43:03 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_PP1 pid=443) DEBUG 04-22 01:43:03 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_PP1 pid=443) DEBUG 04-22 01:43:03 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP1 pid=443) DEBUG 04-22 01:43:03 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_PP0 pid=442) DEBUG 04-22 01:43:03 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP0 pid=442) DEBUG 04-22 01:43:03 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_PP0 pid=442) DEBUG 04-22 01:43:03 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_PP0 pid=442) DEBUG 04-22 01:43:03 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP0 pid=442) DEBUG 04-22 01:43:03 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_PP0 pid=442) INFO 04-22 01:43:05 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(Worker_PP0 pid=442) DEBUG 04-22 01:43:05 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/6749c2ea5a/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(Worker_PP1 pid=443) DEBUG 04-22 01:43:06 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP1 pid=443) DEBUG 04-22 01:43:06 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass 
completed in 0.6 ms -(Worker_PP1 pid=443) DEBUG 04-22 01:43:06 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_PP1 pid=443) DEBUG 04-22 01:43:06 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP1 pid=443) DEBUG 04-22 01:43:06 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_PP0 pid=442) DEBUG 04-22 01:43:06 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP0 pid=442) DEBUG 04-22 01:43:06 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_PP0 pid=442) DEBUG 04-22 01:43:06 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_PP0 pid=442) DEBUG 04-22 01:43:06 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP0 pid=442) DEBUG 04-22 01:43:06 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_PP0 pid=442) DEBUG 04-22 01:43:07 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/6749c2ea5a/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(Worker_PP1 pid=443) DEBUG 04-22 01:43:07 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP1 pid=443) DEBUG 04-22 01:43:07 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_PP1 pid=443) DEBUG 04-22 01:43:07 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.6 ms -(Worker_PP1 pid=443) DEBUG 04-22 01:43:07 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP1 pid=443) DEBUG 04-22 01:43:07 
[compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_PP0 pid=442) DEBUG 04-22 01:43:07 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP0 pid=442) DEBUG 04-22 01:43:07 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_PP0 pid=442) DEBUG 04-22 01:43:07 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_PP0 pid=442) DEBUG 04-22 01:43:07 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP0 pid=442) DEBUG 04-22 01:43:07 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_PP0 pid=442) DEBUG 04-22 01:43:08 [compilation/backends.py:377] Store the 16-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_16', '/data/.cache/vllm/torch_compile_cache/6749c2ea5a/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_16') -(Worker_PP0 pid=442) INFO 04-22 01:43:08 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.20 s -(Worker_PP0 pid=442) DEBUG 04-22 01:43:08 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/6749c2ea5a/rank_0_0/backbone/computation_graph.py -(Worker_PP0 pid=442) INFO 04-22 01:43:08 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/378a2f90d7ba67f1593b3e36e293b0841d472907a0cee2c9633e63eaed3ee443/rank_0_0/model -(Worker_PP0 pid=442) INFO 04-22 01:43:08 [compilation/monitor.py:48] torch.compile took 8.22 s in total -(Worker_PP0 pid=442) INFO 04-22 01:43:09 [compilation/monitor.py:76] Initial profiling/warmup run took 0.37 s -(Worker_PP0 pid=442) INFO 04-22 01:43:09 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_PP0 pid=442) 
DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_PP0 pid=442) INFO 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 114.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=442) DEBUG 04-22 
01:43:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_PP0 pid=442) DEBUG 04-22 01:43:09 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 256.00 MiB first-capture + (51-1) × 2.00 MiB per-graph -(Worker_PP0 pid=442) DEBUG 04-22 01:43:10 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_PP0 pid=442) INFO 04-22 01:43:10 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.54 GiB total -(Worker_PP0 pid=442) DEBUG 04-22 01:43:10 [v1/worker/gpu_worker.py:424] Initial free memory: 77.76 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_PP0 pid=442) DEBUG 04-22 01:43:10 [v1/worker/gpu_worker.py:430] Free memory after profiling: 70.02 GiB (total), 67.49 GiB (within requested) -(Worker_PP0 pid=442) DEBUG 04-22 01:43:10 [v1/worker/gpu_worker.py:435] Memory profiling takes 9.87 seconds. Total non KV cache memory: 8.69GiB; torch peak memory increase: 1.1GiB; non-torch forward increase memory: 0.07GiB; weights memory: 7.51GiB. -(Worker_PP0 pid=442) INFO 04-22 01:43:10 [v1/worker/gpu_worker.py:436] Available KV cache memory: 66.54 GiB -(Worker_PP0 pid=442) INFO 04-22 01:43:10 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9569 to maintain the same effective KV cache size. 
-(Worker_PP0 pid=442) DEBUG 04-22 01:43:10 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 01:43:10 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 01:43:10 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) DEBUG 04-22 01:43:13 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_PP1 pid=443) INFO 04-22 01:43:14 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_PP1 pid=443) INFO 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 134.00 MiB 
first-capture + (51-1) × 4.00 MiB per-graph -(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_PP1 pid=443) DEBUG 04-22 01:43:14 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 256.00 MiB first-capture + (51-1) × 2.00 MiB per-graph -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_PP1 pid=443) INFO 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.54 GiB total -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_worker.py:424] Initial free memory: 77.76 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_worker.py:430] Free memory after profiling: 69.8 GiB (total), 67.26 GiB (within requested) -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_worker.py:435] Memory profiling takes 14.85 seconds. Total non KV cache memory: 9.6GiB; torch peak memory increase: 2.01GiB; non-torch forward increase memory: 0.07GiB; weights memory: 7.51GiB. 
-(Worker_PP1 pid=443) INFO 04-22 01:43:15 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9569 to maintain the same effective KV cache size. -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 01:43:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) INFO 04-22 01:43:15 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,075,312 tokens -(EngineCore pid=243) INFO 04-22 01:43:15 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 131.26x -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_PP0 pid=442) DEBUG 04-22 01:43:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 01:43:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_PP1 pid=443) 2026-04-22 01:43:15,416 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_PP0 pid=442) 2026-04-22 01:43:15,416 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=442) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) 2026-04-22 01:43:15,424 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_PP0 pid=442) 2026-04-22 01:43:15,424 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=480, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=464, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=448, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP1 pid=443) DEBUG 04-22 01:43:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=442) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:43:37 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:43:37 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:43:37 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:43:37 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:43:37 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:43:37 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct -(APIServer pid=1) INFO 04-22 01:43:37 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:43:37 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:43:37 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 8192, 'pipeline_parallel_size': 4, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:43:37 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:43:37 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:43:37 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0009235 secs -(APIServer pid=1) INFO 04-22 01:43:37 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 01:43:37 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 01:43:37 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:43:37 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:43:37 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:43:37 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 01:43:37 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:43:37 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 01:43:37 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:43:37 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 01:43:37 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:43:37 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:43:38 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:43:38 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:43:41 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:43:41 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:43:41 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:43:41 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:43:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:43:41 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:43:41 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:43:41 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:43:41 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:43:41 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:43:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
-DEBUG 04-22 01:43:41 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:43:46 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=242) DEBUG 04-22 01:43:48 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:43:48 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=242) DEBUG 04-22 01:43:48 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/41703b62-cf91-44bb-8658-ca90b0b1dddd'], outputs=['ipc:///tmp/ded797e4-47d0-44f5-b379-ddd276a01a31'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=242) DEBUG 04-22 01:43:48 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=242) DEBUG 04-22 01:43:48 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=242) DEBUG 04-22 01:43:48 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=242) DEBUG 04-22 01:43:48 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=242) DEBUG 04-22 01:43:48 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=242) INFO 04-22 01:43:48 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=4, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=242) WARNING 04-22 01:43:48 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=242) INFO 04-22 01:43:48 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.130.2.185 (local), world_size=4, local_world_size=4 -(EngineCore pid=242) DEBUG 04-22 01:43:48 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/097fc3f9-a611-40fe-aaca-2a5bf508b979 -(EngineCore pid=242) DEBUG 04-22 01:43:48 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_ed275266'), local_subscribe_addr='ipc:///tmp/097fc3f9-a611-40fe-aaca-2a5bf508b979', local_notify_addr='ipc:///tmp/ec1e175c-82f2-4d1b-a146-12d710e40bc0', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 01:43:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:43:51 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:43:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:43:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:43:51 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:43:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:43:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:43:51 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:43:51 [platforms/__init__.py:62] Checking if CUDA platform is available. 
-DEBUG 04-22 01:43:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:43:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:43:51 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:43:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:43:51 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:43:51 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:43:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:113] Checking if ROCm platform is available. 
-DEBUG 04-22 01:43:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:43:51 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:43:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:43:51 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:43:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:43:51 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:43:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:43:57 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:43:57 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-22 01:43:57 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(APIServer pid=1) DEBUG 04-22 01:43:58 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -DEBUG 04-22 01:43:58 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:43:58 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:43:58 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:43:58 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 01:43:58 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:43:58 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:43:58 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:43:58 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 01:43:58 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:43:58 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:43:58 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:43:58 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-DEBUG 04-22 01:43:58 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:43:58 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:43:58 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:43:58 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(Worker pid=441) DEBUG 04-22 01:44:00 [distributed/parallel_state.py:1356] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:44231 backend=nccl -(Worker pid=441) INFO 04-22 01:44:00 [distributed/parallel_state.py:1400] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:44231 backend=nccl -(Worker pid=444) DEBUG 04-22 01:44:00 [distributed/parallel_state.py:1356] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:44231 backend=nccl -(Worker pid=444) INFO 04-22 01:44:00 [distributed/parallel_state.py:1400] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:44231 backend=nccl -(Worker pid=443) DEBUG 04-22 01:44:00 [distributed/parallel_state.py:1356] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:44231 backend=nccl -(Worker pid=443) INFO 04-22 01:44:00 [distributed/parallel_state.py:1400] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:44231 backend=nccl -(Worker pid=442) DEBUG 04-22 01:44:00 [distributed/parallel_state.py:1356] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:44231 backend=nccl -(Worker pid=442) INFO 04-22 01:44:00 [distributed/parallel_state.py:1400] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:44231 backend=nccl -[Gloo] Rank 0 is connected to 3 peer ranks. 
Expected number of connected peer ranks is : 3 -[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=442) DEBUG 04-22 01:44:00 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=443) DEBUG 04-22 01:44:00 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=441) DEBUG 04-22 01:44:00 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=444) DEBUG 04-22 01:44:00 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 1 is connected to 3 peer ranks. 
Expected number of connected peer ranks is : 3 -[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=441) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=442) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=441) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=442) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(Worker pid=441) DEBUG 04-22 01:44:01 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=441) INFO 04-22 01:44:01 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(Worker pid=441) INFO 04-22 01:44:02 [distributed/parallel_state.py:1716] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(Worker pid=443) DEBUG 04-22 01:44:02 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, timestamp=1776822242.6008415, auto_measure=True -(Worker pid=443) DEBUG 04-22 01:44:02 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=441) DEBUG 04-22 01:44:02 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, timestamp=1776822242.6116424, auto_measure=True -(Worker pid=444) DEBUG 04-22 01:44:02 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, timestamp=1776822242.6117628, auto_measure=True -(Worker pid=441) DEBUG 04-22 01:44:02 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=444) DEBUG 04-22 01:44:02 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=443) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic 
dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=444) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=441) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=441) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=443) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=441) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=441) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=442) DEBUG 04-22 
01:44:02 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, timestamp=1776822242.755553, auto_measure=True -(Worker pid=442) DEBUG 04-22 01:44:02 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=443) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=441) DEBUG 04-22 01:44:02 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=444) DEBUG 04-22 01:44:02 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=441) DEBUG 04-22 01:44:02 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=443) DEBUG 04-22 01:44:02 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=441) DEBUG 04-22 01:44:02 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_PP0 pid=441) INFO 04-22 01:44:02 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... 
-(Worker pid=442) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=442) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=442) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=442) DEBUG 04-22 01:44:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=442) DEBUG 04-22 01:44:02 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker_PP0 pid=441) DEBUG 04-22 01:44:03 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_PP0 pid=441) INFO 04-22 01:44:03 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(Worker_PP0 pid=441) INFO 04-22 01:44:03 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_PP3 pid=444) DEBUG 04-22 01:44:03 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_PP2 pid=443) DEBUG 04-22 01:44:03 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_PP3 pid=444) DEBUG 04-22 01:44:03 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_PP3 pid=444) DEBUG 04-22 01:44:03 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 17, 'silu_and_mul': 8, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_PP3 pid=444) DEBUG 04-22 01:44:03 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_PP2 pid=443) DEBUG 04-22 01:44:03 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_PP2 pid=443) DEBUG 04-22 01:44:03 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 16, 'silu_and_mul': 8, 'rotary_embedding': 1, 'apply_rotary_emb': 1}) -(Worker_PP2 pid=443) DEBUG 04-22 01:44:03 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_PP0 pid=441) DEBUG 04-22 01:44:03 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_PP0 pid=441) DEBUG 04-22 01:44:03 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_PP0 pid=441) DEBUG 04-22 01:44:03 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 16, 'silu_and_mul': 8, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1}) -(Worker_PP0 pid=441) DEBUG 04-22 01:44:03 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_PP1 pid=442) DEBUG 04-22 01:44:03 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_PP1 pid=442) DEBUG 04-22 01:44:03 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_PP1 pid=442) DEBUG 04-22 01:44:03 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 16, 'silu_and_mul': 8, 'rotary_embedding': 1, 'apply_rotary_emb': 1}) -(Worker_PP1 pid=442) DEBUG 04-22 01:44:03 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_PP2 pid=443) DEBUG 04-22 01:44:03 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00001-of-00004.safetensors']] -(Worker_PP0 pid=441) DEBUG 04-22 01:44:03 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors']] -(Worker_PP3 pid=444) DEBUG 04-22 01:44:03 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors']] -(Worker_PP1 pid=442) DEBUG 04-22 01:44:03 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors']] -(Worker_PP0 pid=441) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(Worker_PP1 pid=442) DEBUG 04-22 01:44:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP2 pid=443) DEBUG 04-22 01:44:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP3 pid=444) DEBUG 04-22 01:44:06 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=442) DEBUG 04-22 01:44:06 [compilation/decorators.py:528] Start compiling function -(Worker_PP2 pid=443) DEBUG 04-22 01:44:06 [compilation/decorators.py:528] Start compiling function -(Worker_PP3 pid=444) DEBUG 04-22 01:44:06 [compilation/decorators.py:528] Start compiling function -(EngineCore pid=242) DEBUG 04-22 01:44:07 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) DEBUG 04-22 01:44:08 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_PP2 pid=443) DEBUG 
04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/sequence.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/sequence.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_PP0 pid=441) 
DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/sequence.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/sequence.py -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=9f910dc055 comp=e546579c48 code=7f2f48347ccdde3394423985176b64bbf478231b0e1845a35438330041d319b6 dir=/data/.cache/vllm/torch_compile_cache/c5dad8de95/rank_2_0/backbone -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=cfaafe1362 comp=e546579c48 code=7f2f48347ccdde3394423985176b64bbf478231b0e1845a35438330041d319b6 dir=/data/.cache/vllm/torch_compile_cache/f350f8e302/rank_3_0/backbone -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_PP2 pid=443) DEBUG 04-22 
01:44:08 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': 
None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, 
-(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_PP2 pid=443) 
DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': 
True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_PP2 pid=443) DEBUG 
04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, 
-(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_PP2 pid=443) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] Vllm config hash: 9f910dc055 -(Worker_PP0 pid=441) INFO 04-22 01:44:08 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/55a7723969/rank_0_0/backbone for vLLM's torch.compile -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=980a30b2da comp=e546579c48 code=7f24f63a7548e6fdf64e57e8c7d52119b84b1d43a0b72f3e4a2b71446193f817 dir=/data/.cache/vllm/torch_compile_cache/55a7723969/rank_0_0/backbone -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_PP3 pid=444) DEBUG 04-22 
01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, 
-(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, 
-(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_PP3 pid=444) DEBUG 04-22 
01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, 
-(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_PP3 pid=444) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] Vllm config hash: cfaafe1362 -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=9f910dc055 comp=e546579c48 code=7f2f48347ccdde3394423985176b64bbf478231b0e1845a35438330041d319b6 dir=/data/.cache/vllm/torch_compile_cache/c5dad8de95/rank_1_0/backbone -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_PP0 pid=441) DEBUG 
04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_PP0 pid=441) DEBUG 04-22 
01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_PP0 pid=441) DEBUG 
04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_PP0 pid=441) DEBUG 
04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 
0, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_PP0 pid=441) DEBUG 
04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] Vllm config hash: 980a30b2da -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_PP1 pid=442) DEBUG 04-22 
01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), 
-(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': 
True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 
[compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_PP1 pid=442) DEBUG 
04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 
'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_PP1 pid=442) DEBUG 04-22 
01:44:08 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_PP1 pid=442) DEBUG 04-22 01:44:08 [compilation/backends.py:1074] Vllm config hash: 9f910dc055 -(Worker_PP0 pid=441) INFO 04-22 01:44:08 [compilation/backends.py:1111] Dynamo bytecode transform time: 1.79 s -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(Worker_PP0 pid=441) DEBUG 04-22 01:44:08 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(Worker_PP2 pid=443) DEBUG 04-22 01:44:09 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP2 pid=443) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_PP2 pid=443) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_PP2 pid=443) DEBUG 04-22 01:44:09 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP2 pid=443) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_PP3 pid=444) DEBUG 04-22 01:44:09 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP3 pid=444) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_PP3 pid=444) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_PP3 pid=444) DEBUG 04-22 01:44:09 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP3 pid=444) DEBUG 04-22 01:44:09 
[compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_PP1 pid=442) DEBUG 04-22 01:44:09 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP1 pid=442) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_PP1 pid=442) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_PP1 pid=442) DEBUG 04-22 01:44:09 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP1 pid=442) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_PP0 pid=441) DEBUG 04-22 01:44:09 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP0 pid=441) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_PP0 pid=441) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_PP0 pid=441) DEBUG 04-22 01:44:09 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP0 pid=441) DEBUG 04-22 01:44:09 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_PP0 pid=441) INFO 04-22 01:44:11 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(Worker_PP0 pid=441) DEBUG 04-22 01:44:11 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/55a7723969/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(Worker_PP0 pid=441) DEBUG 04-22 01:44:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP0 pid=441) 
DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_PP0 pid=441) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_PP0 pid=441) DEBUG 04-22 01:44:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP0 pid=441) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_PP2 pid=443) DEBUG 04-22 01:44:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP2 pid=443) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_PP2 pid=443) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_PP2 pid=443) DEBUG 04-22 01:44:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP2 pid=443) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_PP3 pid=444) DEBUG 04-22 01:44:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP3 pid=444) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_PP3 pid=444) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(Worker_PP3 pid=444) DEBUG 04-22 01:44:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP3 pid=444) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_PP1 pid=442) DEBUG 04-22 01:44:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP1 pid=442) DEBUG 04-22 01:44:11 
[compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(Worker_PP1 pid=442) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(Worker_PP1 pid=442) DEBUG 04-22 01:44:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP1 pid=442) DEBUG 04-22 01:44:11 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_PP0 pid=441) DEBUG 04-22 01:44:13 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/55a7723969/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(Worker_PP0 pid=441) DEBUG 04-22 01:44:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP0 pid=441) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_PP0 pid=441) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_PP0 pid=441) DEBUG 04-22 01:44:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP0 pid=441) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.6 ms -(Worker_PP2 pid=443) DEBUG 04-22 01:44:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP2 pid=443) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_PP2 pid=443) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_PP2 pid=443) DEBUG 04-22 01:44:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP2 
pid=443) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_PP3 pid=444) DEBUG 04-22 01:44:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP3 pid=444) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_PP3 pid=444) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_PP3 pid=444) DEBUG 04-22 01:44:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP3 pid=444) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_PP1 pid=442) DEBUG 04-22 01:44:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_PP1 pid=442) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_PP1 pid=442) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_PP1 pid=442) DEBUG 04-22 01:44:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_PP1 pid=442) DEBUG 04-22 01:44:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_PP0 pid=441) DEBUG 04-22 01:44:13 [compilation/backends.py:377] Store the 8-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_8', '/data/.cache/vllm/torch_compile_cache/55a7723969/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_8') -(Worker_PP0 pid=441) INFO 04-22 01:44:13 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.28 s -(Worker_PP0 pid=441) DEBUG 04-22 01:44:13 [compilation/backends.py:1228] Computation graph saved to 
/data/.cache/vllm/torch_compile_cache/55a7723969/rank_0_0/backbone/computation_graph.py -(Worker_PP0 pid=441) INFO 04-22 01:44:14 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/0ec2ebc3c5e681c25779ace37f1592abad189ac56456d07660699b9de0e0511a/rank_0_0/model -(Worker_PP0 pid=441) INFO 04-22 01:44:14 [compilation/monitor.py:48] torch.compile took 7.37 s in total -(Worker_PP0 pid=441) INFO 04-22 01:44:14 [compilation/monitor.py:76] Initial profiling/warmup run took 0.33 s -(Worker_PP2 pid=443) INFO 04-22 01:44:14 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_PP2 pid=443) INFO 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_PP0 pid=441) INFO 04-22 01:44:14 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_PP0 pid=441) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_PP0 pid=441) INFO 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None 
-(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 1.00 MiB per-graph -(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_PP2 pid=443) DEBUG 04-22 01:44:14 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 256.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(Worker_PP1 pid=442) INFO 04-22 01:44:15 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_PP1 pid=442) INFO 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 
(largest=512), FULL=51 (largest=512) -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 1.00 MiB per-graph -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, 
num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 256.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 1.00 MiB per-graph -(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 256.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(Worker_PP2 pid=443) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_PP2 pid=443) INFO 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.49 GiB total -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_PP0 pid=441) INFO 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.49 GiB total -(Worker_PP2 pid=443) DEBUG 04-22 01:44:15 [v1/worker/gpu_worker.py:424] Initial free memory: 77.71 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_PP2 pid=443) DEBUG 04-22 01:44:15 [v1/worker/gpu_worker.py:430] Free memory after profiling: 74.07 GiB (total), 71.59 GiB (within requested) -(Worker_PP2 pid=443) DEBUG 04-22 01:44:15 [v1/worker/gpu_worker.py:435] Memory profiling takes 9.14 seconds. Total non KV cache memory: 4.4GiB; torch peak memory increase: 1.05GiB; non-torch forward increase memory: 0.07GiB; weights memory: 3.28GiB. -(Worker_PP2 pid=443) INFO 04-22 01:44:15 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9562 to maintain the same effective KV cache size. 
-(Worker_PP2 pid=443) DEBUG 04-22 01:44:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_PP1 pid=442) DEBUG 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_PP1 pid=442) INFO 04-22 01:44:15 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.49 GiB total -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_worker.py:424] Initial free memory: 77.71 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_worker.py:430] Free memory after profiling: 73.22 GiB (total), 70.74 GiB (within requested) -(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [v1/worker/gpu_worker.py:435] Memory profiling takes 9.27 seconds. Total non KV cache memory: 5.43GiB; torch peak memory increase: 1.1GiB; non-torch forward increase memory: 0.07GiB; weights memory: 4.26GiB. -(Worker_PP0 pid=441) INFO 04-22 01:44:15 [v1/worker/gpu_worker.py:436] Available KV cache memory: 69.8 GiB -(Worker_PP0 pid=441) INFO 04-22 01:44:15 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9562 to maintain the same effective KV cache size. 
-(Worker_PP0 pid=441) DEBUG 04-22 01:44:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=242) DEBUG 04-22 01:44:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=242) DEBUG 04-22 01:44:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_PP1 pid=442) DEBUG 04-22 01:44:16 [v1/worker/gpu_worker.py:424] Initial free memory: 77.71 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_PP1 pid=442) DEBUG 04-22 01:44:16 [v1/worker/gpu_worker.py:430] Free memory after profiling: 74.07 GiB (total), 71.59 GiB (within requested) -(Worker_PP1 pid=442) DEBUG 04-22 01:44:16 [v1/worker/gpu_worker.py:435] Memory profiling takes 9.52 seconds. Total non KV cache memory: 4.4GiB; torch peak memory increase: 1.05GiB; non-torch forward increase memory: 0.07GiB; weights memory: 3.28GiB. -(Worker_PP1 pid=442) INFO 04-22 01:44:16 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9562 to maintain the same effective KV cache size. -(Worker_PP1 pid=442) DEBUG 04-22 01:44:16 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=242) DEBUG 04-22 01:44:16 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=242) DEBUG 04-22 01:44:16 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) DEBUG 04-22 01:44:18 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_PP3 pid=444) INFO 04-22 01:44:20 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_PP3 pid=444) INFO 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 132.00 MiB first-capture + (51-1) × 1.00 MiB per-graph -(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
-(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 256.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(Worker_PP3 pid=444) DEBUG 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_PP3 pid=444) INFO 04-22 01:44:20 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.49 GiB total -(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_worker.py:424] Initial free memory: 77.71 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_worker.py:430] Free memory after profiling: 73.0 GiB (total), 70.52 GiB (within requested) -(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_worker.py:435] Memory profiling takes 14.46 seconds. Total non KV cache memory: 6.34GiB; torch peak memory increase: 2.01GiB; non-torch forward increase memory: 0.07GiB; weights memory: 4.26GiB. -(Worker_PP3 pid=444) INFO 04-22 01:44:21 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9562 to maintain the same effective KV cache size. 
-(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=242) DEBUG 04-22 01:44:21 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=242) INFO 04-22 01:44:21 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 2,257,264 tokens -(EngineCore pid=242) INFO 04-22 01:44:21 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 275.54x -(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_PP0 pid=441) DEBUG 04-22 01:44:21 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_PP1 pid=442) DEBUG 04-22 01:44:21 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_PP2 pid=443) DEBUG 04-22 01:44:21 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=242) DEBUG 04-22 01:44:21 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=242) DEBUG 04-22 01:44:21 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_PP3 pid=444) 2026-04-22 01:44:21,255 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_PP2 pid=443) 2026-04-22 01:44:21,255 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_PP0 pid=441) 2026-04-22 01:44:21,255 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_PP1 pid=442) 2026-04-22 01:44:21,257 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP2 pid=443) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=441) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP1 pid=442) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP3 pid=444) 2026-04-22 01:44:21,264 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_PP2 pid=443) 2026-04-22 01:44:21,264 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_PP0 pid=441) 2026-04-22 01:44:21,264 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_PP1 pid=442) 2026-04-22 01:44:21,264 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP2 pid=443) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=480, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP2 pid=443) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP2 pid=443) DEBUG 04-22 01:44:21 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_PP3 pid=444) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP2 pid=443) DEBUG 04-22 01:44:21 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_PP0 pid=441) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:35:30 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:35:30 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:35:30 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:35:30 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:35:30 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:35:30 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct -(APIServer pid=1) INFO 04-22 01:35:30 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:35:30 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:35:30 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:35:30 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:35:31 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:35:31 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0033037 secs -(APIServer pid=1) INFO 04-22 01:35:31 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 01:35:31 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 01:35:31 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:35:31 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:35:31 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:35:31 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 01:35:31 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:35:31 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 01:35:31 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:35:31 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) DEBUG 04-22 01:35:32 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. -(APIServer pid=1) INFO 04-22 01:35:32 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 01:35:32 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:35:32 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:35:33 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:35:33 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:35:36 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:35:36 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:35:36 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:35:36 [platforms/__init__.py:62] Checking if CUDA platform is available. 
-DEBUG 04-22 01:35:37 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:35:37 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:35:37 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:35:37 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:35:37 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:35:37 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:35:37 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:35:37 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:35:41 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(APIServer pid=1) DEBUG 04-22 01:35:43 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. -(EngineCore pid=245) DEBUG 04-22 01:35:43 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:35:43 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=245) DEBUG 04-22 01:35:43 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/0d7210ec-58b5-4ffb-9e6f-cde13e376fc7'], outputs=['ipc:///tmp/424bfdcd-ff34-43c0-b07a-8ca1f43e5154'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=245) DEBUG 04-22 01:35:43 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=245) DEBUG 04-22 01:35:43 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=245) DEBUG 04-22 01:35:43 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=245) DEBUG 04-22 01:35:43 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=245) DEBUG 04-22 01:35:43 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=245) INFO 04-22 01:35:43 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=245) WARNING 04-22 01:35:43 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=245) INFO 04-22 01:35:43 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.129.9.32 (local), world_size=2, local_world_size=2 -(EngineCore pid=245) DEBUG 04-22 01:35:43 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/4aa6a999-a067-4b8d-ae8d-8df7a86cf373 -(EngineCore pid=245) DEBUG 04-22 01:35:43 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_d30f3657'), local_subscribe_addr='ipc:///tmp/4aa6a999-a067-4b8d-ae8d-8df7a86cf373', local_notify_addr='ipc:///tmp/cdf14d75-6649-4e99-9342-5c86670188e6', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 01:35:46 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:35:46 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:35:46 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:35:46 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:35:46 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:35:46 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:35:46 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:35:46 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:35:46 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:35:46 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:35:46 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:35:46 [platforms/__init__.py:134] Checking if XPU platform is available. 
-DEBUG 04-22 01:35:46 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:35:46 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:35:46 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:35:46 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:35:46 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:35:46 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:35:46 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:35:46 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:35:46 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:35:46 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:35:46 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:35:46 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:35:51 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:35:51 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:35:53 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:35:53 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:35:53 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:35:53 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-DEBUG 04-22 01:35:53 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:35:53 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:35:53 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:35:53 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) DEBUG 04-22 01:35:53 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker pid=444) DEBUG 04-22 01:35:53 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:44935 backend=nccl -(Worker pid=444) INFO 04-22 01:35:53 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:44935 backend=nccl -(Worker pid=445) DEBUG 04-22 01:35:53 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:44935 backend=nccl -(Worker pid=445) INFO 04-22 01:35:53 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:44935 backend=nccl -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=445) DEBUG 04-22 01:35:53 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=444) DEBUG 04-22 01:35:53 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 -(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=444) DEBUG 04-22 01:35:54 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=444) INFO 04-22 01:35:54 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=445) DEBUG 04-22 01:35:55 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=444) DEBUG 04-22 01:35:55 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=444) DEBUG 04-22 01:35:55 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/adab702a-4289-4a96-8bc9-d0d6e0d605b8 -(Worker pid=444) DEBUG 04-22 01:35:55 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_29a8fa29'), local_subscribe_addr='ipc:///tmp/adab702a-4289-4a96-8bc9-d0d6e0d605b8', local_notify_addr='ipc:///tmp/1eb60031-4fb6-4f99-803a-b938389780d0', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=445) DEBUG 04-22 01:35:55 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/adab702a-4289-4a96-8bc9-d0d6e0d605b8 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(Worker pid=444) INFO 04-22 01:35:55 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(Worker pid=445) DEBUG 04-22 01:35:55 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776821755.3217318, auto_measure=True -(Worker pid=445) DEBUG 04-22 01:35:55 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=444) DEBUG 04-22 01:35:55 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776821755.353211, auto_measure=True -(Worker pid=444) DEBUG 04-22 01:35:55 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=445) DEBUG 04-22 01:35:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=445) DEBUG 04-22 01:35:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=445) DEBUG 04-22 01:35:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:35:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:35:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=445) DEBUG 04-22 01:35:55 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=444) DEBUG 04-22 01:35:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=445) DEBUG 04-22 01:35:55 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=444) DEBUG 04-22 01:35:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=444) DEBUG 04-22 01:35:55 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=444) DEBUG 04-22 01:35:55 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=444) DEBUG 04-22 01:35:55 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_TP0 pid=444) INFO 04-22 01:35:55 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... -(Worker_TP0 pid=444) DEBUG 04-22 01:35:55 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_TP0 pid=444) INFO 04-22 01:35:55 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(Worker_TP0 pid=444) INFO 04-22 01:35:55 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP1 pid=445) DEBUG 04-22 01:35:56 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP1 pid=445) DEBUG 04-22 01:35:56 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP1 pid=445) DEBUG 04-22 01:35:56 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=445) DEBUG 04-22 01:35:56 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP0 pid=444) DEBUG 04-22 01:35:56 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=444) DEBUG 04-22 01:35:56 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=444) DEBUG 04-22 01:35:56 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=444) DEBUG 04-22 01:35:56 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP0 pid=444) DEBUG 04-22 01:35:56 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00002-of-00004.safetensors']] -(Worker_TP0 pid=444) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(Worker_TP0 pid=444) DEBUG 04-22 01:36:07 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:36:07 [compilation/decorators.py:528] Start compiling function -(EngineCore pid=245) DEBUG 04-22 01:36:08 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=f16cc641b7 comp=e546579c48 
code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/c9dc6a35ed/rank_1_0/backbone -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP1 pid=445) 
DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': 
False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 
[compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 
[compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 
[compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 
[compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP1 pid=445) DEBUG 
04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, 
-(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP1 
pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 
'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP1 pid=445) DEBUG 04-22 01:36:11 [compilation/backends.py:1074] Vllm config hash: f16cc641b7 -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py 
-(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP0 
pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP0 pid=444) DEBUG 04-22 01:36:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP0 pid=444) INFO 04-22 01:36:12 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/c9dc6a35ed/rank_0_0/backbone for vLLM's torch.compile -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=f16cc641b7 comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/c9dc6a35ed/rank_0_0/backbone -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, 
-(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 
'VLLM_BATCH_INVARIANT': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 
[compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 
[compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, 
-(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': 
-1, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP0 pid=444) DEBUG 04-22 
01:36:12 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 
[compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, 
-(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/backends.py:1074] Vllm config hash: f16cc641b7 -(Worker_TP0 pid=444) INFO 04-22 01:36:12 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.63 s -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 8192 -(Worker_TP0 pid=444) INFO 04-22 01:36:12 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm -(Worker_TP0 pid=444) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. 
-(Worker_TP0 pid=444) return func(*args, **kwargs) -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=8192, hidden_dim=4096, dtype=torch.bfloat16 -(Worker_TP1 pid=445) DEBUG 04-22 01:36:12 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=8192, hidden_dim=4096, dtype=torch.bfloat16 -(Worker_TP0 pid=444) INFO 04-22 01:36:12 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(Worker_TP0 pid=444) DEBUG 04-22 01:36:12 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(Worker_TP1 pid=445) DEBUG 04-22 01:36:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 01:36:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:36:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 01:36:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:36:13 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP1 pid=445) DEBUG 04-22 01:36:13 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 25.5 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:36:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:36:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes 
-(Worker_TP1 pid=445) DEBUG 04-22 01:36:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:36:13 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP0 pid=444) DEBUG 04-22 01:36:13 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 27.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:36:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:36:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP0 pid=444) DEBUG 04-22 01:36:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(APIServer pid=1) DEBUG 04-22 01:36:13 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=444) INFO 04-22 01:36:15 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(Worker_TP0 pid=444) DEBUG 04-22 01:36:15 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/c9dc6a35ed/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(Worker_TP0 pid=444) DEBUG 04-22 01:36:16 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 01:36:16 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:36:16 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 01:36:16 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:36:16 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 
pid=444) DEBUG 04-22 01:36:16 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.8 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:36:16 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:36:16 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=444) DEBUG 04-22 01:36:16 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:36:16 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=445) DEBUG 04-22 01:36:16 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.8 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:36:16 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:36:16 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=445) DEBUG 04-22 01:36:16 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:36:16 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/c9dc6a35ed/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(Worker_TP1 pid=445) DEBUG 04-22 01:36:18 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 01:36:18 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms 
-(Worker_TP1 pid=445) DEBUG 04-22 01:36:18 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=445) DEBUG 04-22 01:36:18 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.8 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:36:18 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:36:18 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP1 pid=445) DEBUG 04-22 01:36:18 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 42.6 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/c9dc6a35ed/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') -(Worker_TP0 pid=444) INFO 04-22 01:36:18 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.85 s -(Worker_TP0 pid=444) DEBUG 04-22 01:36:18 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/c9dc6a35ed/rank_0_0/backbone/computation_graph.py -(Worker_TP0 pid=444) INFO 04-22 01:36:19 
[compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/837da8fa190b7b367e1a3f76b3995bb570fece3b66a8838c9778e41806ef4a88/rank_0_0/model -(Worker_TP0 pid=444) INFO 04-22 01:36:19 [compilation/monitor.py:48] torch.compile took 11.94 s in total -(Worker_TP0 pid=444) INFO 04-22 01:36:19 [compilation/monitor.py:76] Initial profiling/warmup run took 0.17 s -(APIServer pid=1) DEBUG 04-22 01:36:23 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP1 pid=445) INFO 04-22 01:36:25 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP1 pid=445) INFO 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP0 pid=444) INFO 04-22 01:36:25 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP0 pid=444) INFO 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) 
-(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 96.00 MiB first-capture + (51-1) × 26.00 MiB per-graph -(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 96.00 MiB first-capture + (51-1) × 26.00 MiB per-graph -(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: 
None -(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 132.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP1 pid=445) INFO 04-22 01:36:25 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP0 pid=444) DEBUG 04-22 01:36:25 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 132.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP0 pid=444) INFO 04-22 01:36:25 
[distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP1 pid=445) DEBUG 04-22 01:36:26 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP1 pid=445) INFO 04-22 01:36:26 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.69 GiB total -(Worker_TP0 pid=444) DEBUG 04-22 01:36:26 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP0 pid=444) INFO 04-22 01:36:26 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.69 GiB total -(Worker_TP1 pid=445) DEBUG 04-22 01:36:26 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP1 pid=445) DEBUG 04-22 01:36:26 [v1/worker/gpu_worker.py:430] Free memory after profiling: 67.66 GiB (total), 65.23 GiB (within requested) -(Worker_TP1 pid=445) DEBUG 04-22 01:36:26 [v1/worker/gpu_worker.py:435] Memory profiling takes 19.45 seconds. Total non KV cache memory: 11.47GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 2.07GiB; weights memory: 7.51GiB. -(Worker_TP1 pid=445) INFO 04-22 01:36:26 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9714 to maintain the same effective KV cache size. 
-(Worker_TP1 pid=445) DEBUG 04-22 01:36:26 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=444) DEBUG 04-22 01:36:26 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP0 pid=444) DEBUG 04-22 01:36:26 [v1/worker/gpu_worker.py:430] Free memory after profiling: 67.66 GiB (total), 65.23 GiB (within requested) -(Worker_TP0 pid=444) DEBUG 04-22 01:36:26 [v1/worker/gpu_worker.py:435] Memory profiling takes 19.38 seconds. Total non KV cache memory: 11.47GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 2.07GiB; weights memory: 7.51GiB. -(Worker_TP0 pid=444) INFO 04-22 01:36:26 [v1/worker/gpu_worker.py:436] Available KV cache memory: 63.76 GiB -(Worker_TP0 pid=444) INFO 04-22 01:36:26 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9714 to maintain the same effective KV cache size. 
-(Worker_TP0 pid=444) DEBUG 04-22 01:36:26 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 01:36:26 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) INFO 04-22 01:36:26 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,044,688 tokens -(EngineCore pid=245) INFO 04-22 01:36:26 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 127.53x -(Worker_TP0 pid=444) DEBUG 04-22 01:36:26 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=445) DEBUG 04-22 01:36:26 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=444) 2026-04-22 01:36:26,785 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP1 pid=445) 2026-04-22 01:36:26,785 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP1 pid=445) DEBUG 04-22 01:36:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:36:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) 2026-04-22 01:36:26,799 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP0 pid=444) 2026-04-22 01:36:26,800 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP0 pid=444) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=245) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=245) DEBUG 04-22 01:36:32 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. -(EngineCore pid=245) INFO 04-22 01:36:32 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP0 pid=444) DEBUG 04-22 01:36:32 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=445) DEBUG 04-22 01:36:32 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 01:36:32 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 01:36:32 [v1/engine/core.py:1158] EngineCore waiting for work. -(EngineCore pid=245) DEBUG 04-22 01:36:32 [v1/engine/core.py:1158] EngineCore waiting for work. -(APIServer pid=1) INFO 04-22 01:36:32 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] -(APIServer pid=1) WARNING 04-22 01:36:32 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 0.6, 'top_p': 0.9}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. -(APIServer pid=1) DEBUG 04-22 01:36:33 [renderers/base.py:197] Warming up chat template processing... -(APIServer pid=1) DEBUG 04-22 01:36:33 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e82621-0d095a92633056173e92f6bd;3df6e5c7-c042-4e34-b6ad-faa54bd55108) -(APIServer pid=1) DEBUG 04-22 01:36:33 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 01:36:33 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/processor_config.json. 
-(APIServer pid=1) DEBUG 04-22 01:36:33 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e82621-5f9cf38e349ad92500e3d5cf;75af7b6e-94c8-4a06-877a-3fd56f8c5edb) -(APIServer pid=1) DEBUG 04-22 01:36:33 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 01:36:33 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/preprocessor_config.json. -(Worker_TP1 pid=445) DEBUG 04-22 01:36:33 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=444) DEBUG 04-22 01:36:33 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) INFO 04-22 01:36:33 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. -(APIServer pid=1) DEBUG 04-22 01:36:33 [renderers/base.py:203] Chat template warmup completed in 0.934s -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:37] Available routes are: -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: HEAD, GET -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /docs, Methods: HEAD, GET -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: HEAD, GET -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /redoc, Methods: HEAD, GET -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /load, Methods: GET 
-(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /version, Methods: GET -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /health, Methods: GET -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /ping, Methods: GET -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /ping, Methods: POST -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: 
/v1/chat/completions/render, Methods: POST -(APIServer pid=1) INFO 04-22 01:36:34 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST -(APIServer pid=1) INFO: Started server process [1] -(APIServer pid=1) INFO: Waiting for application startup. -(APIServer pid=1) INFO: Application startup complete. -(APIServer pid=1) DEBUG 04-22 01:36:38 [v1/engine/async_llm.py:875] Called check_health. -(APIServer pid=1) INFO: 10.129.8.2:49764 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp4pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp4pp1dp1--8192.log deleted file mode 100644 index 545c678c..00000000 --- a/accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b---h100-80gb--tp4pp1dp1--8192.log +++ /dev/null @@ -1,2768 +0,0 @@ -DEBUG 04-22 01:36:52 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:36:52 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:36:52 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:36:52 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:36:52 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:36:52 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:36:52 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:36:52 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:36:52 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:36:52 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:36:52 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:36:52 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-22 01:36:57 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:36:59 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 01:36:59 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:36:59 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:36:59 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:36:59 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 01:36:59 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:36:59 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:36:59 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:36:59 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct -(APIServer pid=1) INFO 04-22 01:36:59 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:36:59 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:36:59 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 8192, 'tensor_parallel_size': 4, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:36:59 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:36:59 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:36:59 [logging_utils/log_time.py:29] Registry inspect model class: 
Elapsed time 0.0004390 secs -(APIServer pid=1) INFO 04-22 01:36:59 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 01:36:59 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 01:36:59 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:36:59 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:36:59 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:36:59 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:36:59 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:36:59 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 01:36:59 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:36:59 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) INFO 04-22 01:37:01 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 01:37:01 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. 
-(APIServer pid=1) DEBUG 04-22 01:37:01 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:37:02 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:37:02 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:37:05 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:37:05 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:37:05 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:37:05 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:37:05 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:37:05 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:37:05 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:37:05 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:37:05 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:37:05 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:37:05 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:37:05 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:37:10 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=244) DEBUG 04-22 01:37:11 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:37:11 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=244) DEBUG 04-22 01:37:11 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/47561f57-e3b4-4886-86f4-b8d671d8d58a'], outputs=['ipc:///tmp/f27c5513-b1bf-46b8-9e3d-d88f9838a989'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=244) DEBUG 04-22 01:37:11 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=244) DEBUG 04-22 01:37:11 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=244) DEBUG 04-22 01:37:11 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=244) DEBUG 04-22 01:37:11 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=244) DEBUG 04-22 01:37:11 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=244) INFO 04-22 01:37:11 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [256, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=244) WARNING 04-22 01:37:11 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=244) INFO 04-22 01:37:11 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.3.123 (local), world_size=4, local_world_size=4 -(EngineCore pid=244) DEBUG 04-22 01:37:11 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/cb2ab28f-0990-478b-8791-16db02f501c6 -(EngineCore pid=244) DEBUG 04-22 01:37:11 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_ba73b0bc'), local_subscribe_addr='ipc:///tmp/cb2ab28f-0990-478b-8791-16db02f501c6', local_notify_addr='ipc:///tmp/dd6de613-afe4-4559-8d5e-5ee043f1a2c4', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 01:37:15 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:37:15 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:37:15 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:37:15 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:37:15 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:37:15 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:37:15 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:37:15 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:37:15 [platforms/__init__.py:62] Checking if CUDA platform is available. 
-DEBUG 04-22 01:37:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:37:15 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:37:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:37:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:37:15 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
-DEBUG 04-22 01:37:15 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:37:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:37:15 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:37:15 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:37:15 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:37:15 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:37:15 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:37:20 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:37:20 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:37:20 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-22 01:37:20 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:37:21 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:37:21 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:37:21 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:37:21 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 01:37:21 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:37:21 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:37:21 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:37:21 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) DEBUG 04-22 01:37:21 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -DEBUG 04-22 01:37:22 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:37:22 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:37:22 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:37:22 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-DEBUG 04-22 01:37:22 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:37:22 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:37:22 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:37:22 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(Worker pid=446) DEBUG 04-22 01:37:22 [distributed/parallel_state.py:1356] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:36977 backend=nccl -(Worker pid=446) INFO 04-22 01:37:22 [distributed/parallel_state.py:1400] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:36977 backend=nccl -(Worker pid=443) DEBUG 04-22 01:37:22 [distributed/parallel_state.py:1356] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:36977 backend=nccl -(Worker pid=443) INFO 04-22 01:37:22 [distributed/parallel_state.py:1400] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:36977 backend=nccl -(Worker pid=444) DEBUG 04-22 01:37:22 [distributed/parallel_state.py:1356] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:36977 backend=nccl -(Worker pid=444) INFO 04-22 01:37:22 [distributed/parallel_state.py:1400] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:36977 backend=nccl -(Worker pid=445) DEBUG 04-22 01:37:23 [distributed/parallel_state.py:1356] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:36977 backend=nccl -(Worker pid=445) INFO 04-22 01:37:23 [distributed/parallel_state.py:1400] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:36977 backend=nccl -[Gloo] Rank 0 is connected to 3 peer ranks. 
Expected number of connected peer ranks is : 3 -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=443) DEBUG 04-22 01:37:23 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=445) DEBUG 04-22 01:37:23 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=444) DEBUG 04-22 01:37:23 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=446) DEBUG 04-22 01:37:23 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=446) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
-(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=446) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=443) DEBUG 04-22 01:37:24 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=443) INFO 04-22 01:37:24 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=446) DEBUG 04-22 01:37:27 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=443) DEBUG 04-22 01:37:27 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=444) DEBUG 04-22 01:37:27 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=445) DEBUG 04-22 01:37:27 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=443) DEBUG 04-22 01:37:27 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/bae3bdc8-aa09-47ea-b15a-66559cba742a -(Worker pid=443) DEBUG 04-22 01:37:27 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3], buffer_handle=(3, 4194304, 6, 'psm_91c73355'), local_subscribe_addr='ipc:///tmp/bae3bdc8-aa09-47ea-b15a-66559cba742a', local_notify_addr='ipc:///tmp/cf6d7708-31bf-4b03-aabf-be42f5d638ec', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=444) DEBUG 04-22 01:37:27 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/bae3bdc8-aa09-47ea-b15a-66559cba742a -(Worker pid=446) DEBUG 04-22 01:37:27 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/bae3bdc8-aa09-47ea-b15a-66559cba742a -(Worker pid=445) DEBUG 04-22 01:37:27 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/bae3bdc8-aa09-47ea-b15a-66559cba742a -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(Worker pid=443) INFO 04-22 01:37:27 [distributed/parallel_state.py:1716] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(Worker pid=443) DEBUG 04-22 01:37:28 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776821848.1358237, auto_measure=True -(Worker pid=443) DEBUG 04-22 01:37:28 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=445) DEBUG 04-22 01:37:28 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776821848.20137, auto_measure=True -(Worker pid=445) DEBUG 04-22 01:37:28 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=443) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 
'input_embeds'] -(Worker pid=446) DEBUG 04-22 01:37:28 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776821848.2335331, auto_measure=True -(Worker pid=446) DEBUG 04-22 01:37:28 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=443) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:37:28 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776821848.248634, auto_measure=True -(Worker pid=444) DEBUG 04-22 01:37:28 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=443) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=443) DEBUG 04-22 01:37:28 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=443) DEBUG 04-22 01:37:28 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). 
-(Worker pid=445) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=445) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=445) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 01:37:28 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_TP0 pid=443) INFO 04-22 01:37:28 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... -(Worker pid=446) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=446) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=445) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=446) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=445) DEBUG 04-22 01:37:28 
[v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=444) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=446) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=446) DEBUG 04-22 01:37:28 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=444) DEBUG 04-22 01:37:28 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=444) DEBUG 04-22 01:37:28 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker_TP0 pid=443) DEBUG 04-22 01:37:28 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_TP0 pid=443) INFO 04-22 01:37:28 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(Worker_TP0 pid=443) INFO 04-22 01:37:28 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP0 pid=443) DEBUG 04-22 01:37:28 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=443) DEBUG 04-22 01:37:28 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=443) DEBUG 04-22 01:37:28 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=443) DEBUG 04-22 01:37:28 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP2 pid=445) DEBUG 04-22 01:37:28 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP2 pid=445) DEBUG 04-22 01:37:28 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP2 pid=445) DEBUG 04-22 01:37:28 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP2 pid=445) DEBUG 04-22 01:37:28 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP3 pid=446) DEBUG 04-22 01:37:28 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP3 pid=446) DEBUG 04-22 01:37:28 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP3 pid=446) DEBUG 04-22 01:37:28 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP3 pid=446) DEBUG 04-22 01:37:28 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP1 pid=444) DEBUG 04-22 01:37:28 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP1 pid=444) DEBUG 04-22 01:37:28 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP1 pid=444) DEBUG 04-22 01:37:28 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=444) DEBUG 04-22 01:37:28 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP0 pid=443) DEBUG 04-22 01:37:29 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors']] -(Worker_TP2 pid=445) DEBUG 04-22 01:37:29 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors']] -(Worker_TP0 pid=443) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(Worker_TP2 pid=445) DEBUG 04-22 01:37:39 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=445) DEBUG 04-22 01:37:39 [compilation/decorators.py:528] Start compiling function -(Worker_TP3 pid=446) DEBUG 04-22 01:37:39 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:37:39 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=446) DEBUG 04-22 01:37:39 [compilation/decorators.py:528] Start compiling function -(Worker_TP1 pid=444) DEBUG 04-22 01:37:39 [compilation/decorators.py:528] Start compiling function -(EngineCore pid=244) DEBUG 04-22 01:37:40 [distributed/device_communicators/shm_broadcast.py:191] Poller 
received notify event -(APIServer pid=1) DEBUG 04-22 01:37:42 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP0 pid=443) INFO 04-22 01:37:44 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone for vLLM's torch.compile -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=3f19a9810e comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 
01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 
512, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP0 
pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] Vllm config hash: 3f19a9810e -(Worker_TP0 pid=443) INFO 04-22 01:37:44 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.45 s -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 2 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 256 -(Worker_TP0 pid=443) INFO 04-22 01:37:44 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=3f19a9810e comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_3_0/backbone -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP3 
pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', 
-(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, 
-(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP3 pid=446) DEBUG 04-22 
01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP3 pid=446) 
DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP3 pid=446) DEBUG 
04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] Vllm config hash: 3f19a9810e -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 
[compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=3f19a9810e comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_1_0/backbone -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_API_KEY': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=3f19a9810e comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_2_0/backbone -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, 
-(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, 
-(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP1 pid=444) DEBUG 04-22 
01:37:44 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, 
-(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_SYSTEM_START_DATE': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP1 pid=444) 
DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] Vllm config hash: 3f19a9810e -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP2 pid=445) 
DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP2 
pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP2 pid=445) 
DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 
[compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP2 
pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': 
False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, 
-(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 
'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [compilation/backends.py:1074] Vllm config hash: 3f19a9810e -(Worker_TP0 pid=443) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. 
-(Worker_TP0 pid=443) return func(*args, **kwargs) -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=0, max_token_num=256, hidden_dim=4096, dtype=torch.bfloat16 -(Worker_TP1 pid=444) DEBUG 04-22 01:37:44 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=1, max_token_num=256, hidden_dim=4096, dtype=torch.bfloat16 -(Worker_TP2 pid=445) DEBUG 04-22 01:37:44 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=2, max_token_num=256, hidden_dim=4096, dtype=torch.bfloat16 -(Worker_TP3 pid=446) DEBUG 04-22 01:37:44 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=3, max_token_num=256, hidden_dim=4096, dtype=torch.bfloat16 -(Worker_TP0 pid=443) INFO 04-22 01:37:44 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 256), (257, 8192)] -(Worker_TP0 pid=443) DEBUG 04-22 01:37:44 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(Worker_TP0 pid=443) DEBUG 04-22 01:37:45 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=443) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:45 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=444) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms 
-(Worker_TP3 pid=446) DEBUG 04-22 01:37:45 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=446) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:45 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP0 pid=443) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 24.0 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:45 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP2 pid=445) DEBUG 04-22 01:37:45 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=443) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:45 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP1 pid=444) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 25.1 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:45 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP1 pid=444) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:45 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP3 pid=446) DEBUG 04-22 01:37:45 
[compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 25.2 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:45 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP3 pid=446) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:45 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP2 pid=445) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 27.2 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:45 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP2 pid=445) DEBUG 04-22 01:37:45 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=443) INFO 04-22 01:37:47 [compilation/backends.py:372] Cache the graph of compile range (1, 256) for later use -(Worker_TP0 pid=443) DEBUG 04-22 01:37:47 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 256) from inductor_standalone via handle ('artifact_compile_range_1_256_subgraph_0', '/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone/artifact_compile_range_1_256_subgraph_0') -(Worker_TP0 pid=443) DEBUG 04-22 01:37:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=443) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:48 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) -(Worker_TP0 pid=443) DEBUG 04-22 01:37:48 
[compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.6 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=443) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=446) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:48 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) -(Worker_TP3 pid=446) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=446) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=444) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:48 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) -(Worker_TP1 pid=444) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=444) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:48 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=445) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:48 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) -(Worker_TP2 pid=445) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=445) DEBUG 04-22 01:37:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=443) INFO 04-22 01:37:49 [compilation/backends.py:372] Cache the graph of compile range (257, 8192) for later use -(Worker_TP0 pid=443) DEBUG 04-22 01:37:49 [compilation/backends.py:377] Store the 0-th graph for compile range(257, 8192) from inductor_standalone via handle ('artifact_compile_range_257_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone/artifact_compile_range_257_8192_subgraph_0') -(Worker_TP0 pid=443) DEBUG 04-22 01:37:49 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=443) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:49 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=443) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 36.5 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:49 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=443) DEBUG 04-22 
01:37:49 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:49 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=444) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:49 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=446) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:49 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=444) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 37.2 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:49 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=444) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:49 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP3 pid=446) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 37.9 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:49 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP3 pid=446) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:49 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=445) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:49 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP2 pid=445) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.7 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:49 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP2 pid=445) DEBUG 04-22 01:37:49 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:50 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 256) from inductor_standalone via handle ('artifact_compile_range_1_256_subgraph_1', '/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone/artifact_compile_range_1_256_subgraph_1') -(Worker_TP0 pid=443) DEBUG 04-22 01:37:50 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=443) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:50 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) -(Worker_TP0 pid=443) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:50 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=443) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:50 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=446) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:50 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) -(Worker_TP3 pid=446) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:50 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=446) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:50 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=444) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:50 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) -(Worker_TP1 pid=444) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:50 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=444) DEBUG 04-22 01:37:50 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:51 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=445) DEBUG 04-22 01:37:51 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:51 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) -(Worker_TP2 pid=445) DEBUG 04-22 01:37:51 [compilation/passes/vllm_inductor_pass.py:79] 
PostCleanupPass completed in 0.3 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:51 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=445) DEBUG 04-22 01:37:51 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(APIServer pid=1) DEBUG 04-22 01:37:52 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=443) DEBUG 04-22 01:37:52 [compilation/backends.py:377] Store the 1-th graph for compile range(257, 8192) from inductor_standalone via handle ('artifact_compile_range_257_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone/artifact_compile_range_257_8192_subgraph_1') -(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 37.5 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 256) from inductor_standalone via handle ('artifact_compile_range_1_256_subgraph_32', '/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone/artifact_compile_range_1_256_subgraph_32') -(Worker_TP0 pid=443) INFO 
04-22 01:37:54 [compilation/backends.py:390] Compiling a graph for compile range (1, 256) takes 6.41 s -(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.0 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 40.4 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) -(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.7 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/passes/pass_manager.py:100] Skipping with 
compile range (257, 8192) -(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=444) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) -(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=446) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/passes/pass_manager.py:100] Skipping with compile range (257, 8192) -(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=445) DEBUG 04-22 01:37:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms 
-(Worker_TP0 pid=443) DEBUG 04-22 01:37:54 [compilation/backends.py:377] Store the 32-th graph for compile range(257, 8192) from inductor_standalone via handle ('artifact_compile_range_257_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone/artifact_compile_range_257_8192_subgraph_32') -(Worker_TP0 pid=443) INFO 04-22 01:37:54 [compilation/backends.py:390] Compiling a graph for compile range (257, 8192) takes 7.03 s -(Worker_TP0 pid=443) DEBUG 04-22 01:37:55 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/d42086f0a8/rank_0_0/backbone/computation_graph.py -(Worker_TP0 pid=443) INFO 04-22 01:37:56 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/cadcd07743782ae93f16f13d5b370fa694cecf280786fe0de34a40c069d9e40d/rank_0_0/model -(Worker_TP0 pid=443) INFO 04-22 01:37:56 [compilation/monitor.py:48] torch.compile took 16.38 s in total -(Worker_TP0 pid=443) INFO 04-22 01:37:56 [compilation/monitor.py:76] Initial profiling/warmup run took 0.71 s -(APIServer pid=1) DEBUG 04-22 01:38:02 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP0 pid=443) INFO 04-22 01:38:02 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP0 pid=443) INFO 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP1 pid=444) INFO 04-22 01:38:02 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP1 pid=444) INFO 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP3 pid=446) INFO 04-22 01:38:02 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP3 pid=446) INFO 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP2 pid=445) INFO 04-22 01:38:02 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP2 pid=445) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP2 pid=445) INFO 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 
01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 84.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=443) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 70.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(Worker_TP0 pid=443) INFO 04-22 01:38:02 
[distributed/device_communicators/custom_all_reduce.py:216] Registering 260 cuda graph addresses -(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 84.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] 
Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=445) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=444) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 70.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(Worker_TP1 pid=444) INFO 04-22 01:38:02 [distributed/device_communicators/custom_all_reduce.py:216] Registering 260 cuda graph addresses -(Worker_TP2 pid=445) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=445) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 84.00 MiB first-capture + (51-1) × 6.00 MiB per-graph 
-(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=445) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=445) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=445) DEBUG 04-22 01:38:02 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=446) DEBUG 04-22 01:38:02 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=446) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=446) DEBUG 04-22 01:38:03 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 84.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=446) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 70.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(Worker_TP3 pid=446) INFO 04-22 01:38:03 [distributed/device_communicators/custom_all_reduce.py:216] Registering 260 cuda graph addresses 
-(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 70.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(Worker_TP2 pid=445) INFO 04-22 01:38:03 [distributed/device_communicators/custom_all_reduce.py:216] Registering 260 cuda graph addresses -(Worker_TP0 pid=443) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP0 pid=443) INFO 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.57 GiB total -(Worker_TP2 pid=445) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP2 pid=445) INFO 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.57 GiB total -(Worker_TP1 pid=444) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP1 pid=444) INFO 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.57 GiB total -(Worker_TP3 pid=446) DEBUG 04-22 01:38:03 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP3 pid=446) INFO 
04-22 01:38:03 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.57 GiB total -(Worker_TP0 pid=443) DEBUG 04-22 01:38:03 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP0 pid=443) DEBUG 04-22 01:38:03 [v1/worker/gpu_worker.py:430] Free memory after profiling: 71.27 GiB (total), 68.87 GiB (within requested) -(Worker_TP0 pid=443) DEBUG 04-22 01:38:03 [v1/worker/gpu_worker.py:435] Memory profiling takes 23.99 seconds. Total non KV cache memory: 7.79GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 2.13GiB; weights memory: 3.77GiB. -(Worker_TP0 pid=443) INFO 04-22 01:38:03 [v1/worker/gpu_worker.py:436] Available KV cache memory: 67.44 GiB -(Worker_TP0 pid=443) INFO 04-22 01:38:03 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9572 to maintain the same effective KV cache size. -(Worker_TP0 pid=443) DEBUG 04-22 01:38:03 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 01:38:03 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 01:38:03 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=445) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP2 pid=445) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:430] Free memory after profiling: 71.27 GiB (total), 68.87 GiB (within requested) -(Worker_TP2 pid=445) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:435] Memory profiling takes 24.21 seconds. 
Total non KV cache memory: 7.79GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 2.13GiB; weights memory: 3.77GiB. -(Worker_TP2 pid=445) INFO 04-22 01:38:04 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9572 to maintain the same effective KV cache size. -(Worker_TP2 pid=445) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=444) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP1 pid=444) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:430] Free memory after profiling: 71.27 GiB (total), 68.87 GiB (within requested) -(Worker_TP1 pid=444) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:435] Memory profiling takes 24.20 seconds. Total non KV cache memory: 7.79GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 2.13GiB; weights memory: 3.77GiB. -(Worker_TP1 pid=444) INFO 04-22 01:38:04 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9572 to maintain the same effective KV cache size. 
-(Worker_TP1 pid=444) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=446) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP3 pid=446) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:430] Free memory after profiling: 71.27 GiB (total), 68.87 GiB (within requested) -(Worker_TP3 pid=446) DEBUG 04-22 01:38:04 [v1/worker/gpu_worker.py:435] Memory profiling takes 24.21 seconds. Total non KV cache memory: 7.79GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 2.13GiB; weights memory: 3.77GiB. -(Worker_TP3 pid=446) INFO 04-22 01:38:04 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9572 to maintain the same effective KV cache size. 
-(Worker_TP3 pid=446) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) INFO 04-22 01:38:04 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 2,209,792 tokens -(EngineCore pid=244) INFO 04-22 01:38:04 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 269.75x -(Worker_TP0 pid=443) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=445) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=444) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=446) DEBUG 04-22 01:38:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=446) 2026-04-22 01:38:04,195 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP1 pid=444) 2026-04-22 01:38:04,195 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP2 pid=445) 2026-04-22 01:38:04,195 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP0 pid=443) 2026-04-22 01:38:04,196 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(Worker_TP1 pid=444) DEBUG 04-22 01:38:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:38:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=446) DEBUG 04-22 01:38:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=445) DEBUG 04-22 01:38:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) 2026-04-22 01:38:04,212 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP1 pid=444) 2026-04-22 01:38:04,212 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP3 pid=446) 2026-04-22 01:38:04,212 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP2 pid=445) 2026-04-22 01:38:04,213 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP0 pid=443) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=244) INFO 04-22 01:38:10 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP2 pid=445) DEBUG 04-22 01:38:10 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=444) DEBUG 04-22 01:38:10 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=446) DEBUG 04-22 01:38:10 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=443) DEBUG 04-22 01:38:10 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 01:38:10 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 01:38:10 [v1/engine/core.py:1158] EngineCore waiting for work. -(EngineCore pid=244) DEBUG 04-22 01:38:10 [v1/engine/core.py:1158] EngineCore waiting for work. -(APIServer pid=1) INFO 04-22 01:38:10 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] -(APIServer pid=1) WARNING 04-22 01:38:10 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 0.6, 'top_p': 0.9}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. -(APIServer pid=1) DEBUG 04-22 01:38:10 [renderers/base.py:197] Warming up chat template processing... -(APIServer pid=1) DEBUG 04-22 01:38:10 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e82682-32adea411cb375e87c01dc55;358a11b2-1448-46f2-9841-f4af33f585c7) -(APIServer pid=1) DEBUG 04-22 01:38:10 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 01:38:10 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/processor_config.json. 
-(APIServer pid=1) DEBUG 04-22 01:38:10 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e82682-417d35914b7603f52c092bee;5cc6a4a0-42a1-4a0f-ab93-1161ee564144) -(APIServer pid=1) DEBUG 04-22 01:38:10 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 01:38:10 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/preprocessor_config.json. -(Worker_TP2 pid=445) DEBUG 04-22 01:38:11 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=446) DEBUG 04-22 01:38:11 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=444) DEBUG 04-22 01:38:11 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=443) DEBUG 04-22 01:38:11 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) INFO 04-22 01:38:11 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. 
-(APIServer pid=1) DEBUG 04-22 01:38:11 [renderers/base.py:203] Chat template warmup completed in 0.842s -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:37] Available routes are: -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: HEAD, GET -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /docs, Methods: HEAD, GET -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: HEAD, GET -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /redoc, Methods: HEAD, GET -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /load, Methods: GET -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /version, Methods: GET -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /health, Methods: GET -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /ping, Methods: GET -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /ping, Methods: POST -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST -(APIServer pid=1) INFO 04-22 01:38:11 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST -(APIServer pid=1) INFO 04-22 01:38:11 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST -(APIServer pid=1) INFO: Started server process [1] -(APIServer pid=1) INFO: Waiting for application startup. -(APIServer pid=1) INFO: Application startup complete. -(APIServer pid=1) DEBUG 04-22 01:38:16 [v1/engine/async_llm.py:875] Called check_health. -(APIServer pid=1) INFO: 10.131.2.2:51242 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--16384.log b/accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--16384.log deleted file mode 100644 index 8c91a116..00000000 --- a/accuracy/results/v0.19.0/logs/meta-llama-llama-3-1-8b--h100-80gb--tp1pp1dp1--16384.log +++ /dev/null @@ -1,746 +0,0 @@ -DEBUG 04-22 01:47:08 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
-DEBUG 04-22 01:47:08 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:47:08 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:47:08 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:47:08 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:47:08 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:47:08 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:47:08 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:47:08 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:47:08 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:47:08 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:47:08 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:47:13 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:47:15 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 01:47:15 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:47:15 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:47:15 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:47:15 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:47:15 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:47:15 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:47:15 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:47:15 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct -(APIServer pid=1) INFO 04-22 01:47:15 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:47:15 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:47:15 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 16384, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:47:15 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:47:15 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:47:15 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0005018 secs -(APIServer pid=1) INFO 04-22 01:47:15 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 01:47:15 [config/model.py:1678] Using max model len 16384 -(APIServer pid=1) DEBUG 04-22 01:47:15 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:47:15 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:47:15 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:47:15 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:47:15 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) INFO 04-22 01:47:15 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:47:15 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 01:47:15 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:47:15 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:47:16 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:47:16 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:47:19 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:47:19 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:47:19 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:47:19 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:47:19 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:47:19 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:47:19 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:47:19 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:47:19 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:47:19 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:47:19 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:47:19 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:47:24 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-(EngineCore pid=243) DEBUG 04-22 01:47:25 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:47:25 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=243) DEBUG 04-22 01:47:25 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/354c5b62-d6ba-4a15-b1bf-57fa739977cb'], outputs=['ipc:///tmp/2cd607a0-3d07-47c0-8b87-18d629b2f761'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 01:47:25 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 01:47:25 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 01:47:25 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 01:47:25 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 01:47:25 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=243) INFO 04-22 01:47:25 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) DEBUG 04-22 01:47:26 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.37:35185 backend=nccl -(EngineCore pid=243) INFO 04-22 01:47:26 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.37:35185 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) DEBUG 04-22 01:47:26 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) INFO 04-22 01:47:26 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=243) DEBUG 04-22 01:47:26 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822446.960836, auto_measure=True -(EngineCore pid=243) DEBUG 04-22 01:47:26 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=243) DEBUG 04-22 01:47:27 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:47:27 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:47:27 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:47:27 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=243) DEBUG 04-22 01:47:27 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=243) DEBUG 04-22 01:47:27 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=243) DEBUG 04-22 01:47:27 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=243) INFO 04-22 01:47:27 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... -(EngineCore pid=243) DEBUG 04-22 01:47:27 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=243) INFO 04-22 01:47:27 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=243) INFO 04-22 01:47:27 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=243) DEBUG 04-22 01:47:27 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=243) DEBUG 04-22 01:47:27 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=243) DEBUG 04-22 01:47:27 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=243) DEBUG 04-22 01:47:27 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=243) DEBUG 04-22 01:47:28 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00004-of-00004.safetensors']] -(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=243) DEBUG 04-22 
01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=243) INFO 
04-22 01:47:42 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/ff8681c1a3/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=84e6e50ad7 comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/ff8681c1a3/rank_0_0/backbone -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, 
-(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 
[compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 
[compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, 
-(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, 
-(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore 
pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=243) 
DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
-(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/backends.py:1074] Vllm config hash: 84e6e50ad7 -(EngineCore pid=243) INFO 04-22 01:47:42 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.36 s -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:47:42 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) INFO 04-22 01:47:44 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=243) DEBUG 04-22 01:47:44 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/ff8681c1a3/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=243) DEBUG 04-22 01:47:44 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:47:44 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=243) DEBUG 04-22 01:47:44 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms -(EngineCore pid=243) DEBUG 
04-22 01:47:44 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:47:44 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(APIServer pid=1) DEBUG 04-22 01:47:45 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=243) DEBUG 04-22 01:47:46 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/ff8681c1a3/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=243) DEBUG 04-22 01:47:47 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:47:47 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(EngineCore pid=243) DEBUG 04-22 01:47:47 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=243) DEBUG 04-22 01:47:47 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:47:47 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 01:47:47 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/ff8681c1a3/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') -(EngineCore pid=243) INFO 04-22 01:47:47 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.20 s -(EngineCore pid=243) DEBUG 04-22 01:47:47 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/ff8681c1a3/rank_0_0/backbone/computation_graph.py -(EngineCore 
pid=243) INFO 04-22 01:47:48 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/53c51fc11d01c95c773fe7cf86dfcdc35a297d8ecf40ebf20a05b5e48e2fba95/rank_0_0/model -(EngineCore pid=243) INFO 04-22 01:47:48 [compilation/monitor.py:48] torch.compile took 11.07 s in total -(EngineCore pid=243) INFO 04-22 01:47:49 [compilation/monitor.py:76] Initial profiling/warmup run took 0.45 s -(EngineCore pid=243) INFO 04-22 01:47:54 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=243) DEBUG 04-22 01:47:54 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=243) INFO 04-22 01:47:54 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:47:55 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:47:55 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 138.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=243) 
DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:47:55 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:47:55 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 260.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=243) INFO 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total -(APIServer pid=1) DEBUG 04-22 01:47:55 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.24 GiB (total), 58.79 GiB (within requested) -(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_worker.py:435] Memory profiling takes 18.17 seconds. Total non KV cache memory: 17.12GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 14.99GiB. 
-(EngineCore pid=243) INFO 04-22 01:47:55 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.11 GiB -(EngineCore pid=243) INFO 04-22 01:47:55 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. -(EngineCore pid=243) INFO 04-22 01:47:55 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 476,016 tokens -(EngineCore pid=243) INFO 04-22 01:47:55 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 16,384 tokens per request: 29.05x -(EngineCore pid=243) 2026-04-22 01:47:55,992 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=243) DEBUG 04-22 01:47:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) 2026-04-22 01:47:56,001 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:48:15 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:48:15 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:48:15 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:48:15 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:48:15 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:48:15 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-3.1-8B-Instruct -(APIServer pid=1) INFO 04-22 01:48:15 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:48:15 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:48:15 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-3.1-8B-Instruct', 'model': 'meta-llama/Llama-3.1-8B-Instruct', 'max_model_len': 32768, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:48:15 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:48:15 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:48:15 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003805 secs -(APIServer pid=1) INFO 04-22 01:48:15 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 01:48:15 [config/model.py:1678] Using max model len 32768 -(APIServer pid=1) DEBUG 04-22 01:48:15 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:48:15 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:48:15 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:48:15 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:48:15 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) INFO 04-22 01:48:15 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:48:15 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 01:48:16 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:48:16 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:48:16 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:48:16 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:48:20 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:48:20 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:48:20 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:48:20 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:48:20 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:48:20 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:48:20 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:48:20 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:48:20 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:48:20 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:48:20 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:48:20 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:48:24 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-(EngineCore pid=243) DEBUG 04-22 01:48:26 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:48:26 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=243) DEBUG 04-22 01:48:26 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/f4ac1597-2d59-40bb-8ed5-6684c2a3c2e8'], outputs=['ipc:///tmp/8aef749a-2647-48a4-b1b3-6af061f134a4'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 01:48:26 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 01:48:26 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 01:48:26 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 01:48:26 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 01:48:26 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=243) INFO 04-22 01:48:26 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-3.1-8B-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-3.1-8B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-3.1-8B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) DEBUG 04-22 01:48:27 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.38:51655 backend=nccl -(EngineCore pid=243) INFO 04-22 01:48:27 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.38:51655 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) DEBUG 04-22 01:48:27 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) INFO 04-22 01:48:27 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=243) DEBUG 04-22 01:48:27 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822507.546496, auto_measure=True -(EngineCore pid=243) DEBUG 04-22 01:48:27 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=243) DEBUG 04-22 01:48:27 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:48:27 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:48:27 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:48:27 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=243) DEBUG 04-22 01:48:27 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=243) DEBUG 04-22 01:48:27 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=243) DEBUG 04-22 01:48:27 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=243) INFO 04-22 01:48:27 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-3.1-8B-Instruct... -(EngineCore pid=243) DEBUG 04-22 01:48:28 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=243) INFO 04-22 01:48:28 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=243) INFO 04-22 01:48:28 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=243) DEBUG 04-22 01:48:28 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=243) DEBUG 04-22 01:48:28 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=243) DEBUG 04-22 01:48:28 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=243) DEBUG 04-22 01:48:28 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=243) DEBUG 04-22 01:48:28 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors']] -(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(APIServer pid=1) DEBUG 04-22 01:48:36 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=243) INFO 04-22 01:48:40 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/0b188b6917/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=b9b4658ca5 comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/0b188b6917/rank_0_0/backbone -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 01:48:40 
[compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
-(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=243) DEBUG 04-22 
01:48:40 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=243) 
DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=243) DEBUG 04-22 01:48:40 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=243) DEBUG 04-22 01:48:40 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=243) DEBUG 04-22 
01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore 
pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore 
pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/backends.py:1074] Vllm config hash: b9b4658ca5 -(EngineCore pid=243) INFO 04-22 01:48:40 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.57 s -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=243) DEBUG 04-22 01:48:40 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=243) DEBUG 04-22 01:48:41 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:48:41 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 01:48:41 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms -(EngineCore pid=243) DEBUG 04-22 01:48:41 [compilation/.../utility/fix_functionalization.py:203] 
De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:48:41 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) INFO 04-22 01:48:42 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=243) DEBUG 04-22 01:48:42 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/0b188b6917/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=243) DEBUG 04-22 01:48:43 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:48:43 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=243) DEBUG 04-22 01:48:43 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=243) DEBUG 04-22 01:48:43 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:48:43 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) DEBUG 04-22 01:48:44 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/0b188b6917/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=243) DEBUG 04-22 01:48:45 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:48:45 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(EngineCore pid=243) DEBUG 04-22 01:48:45 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=243) 
DEBUG 04-22 01:48:45 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:48:45 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 01:48:45 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/0b188b6917/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') -(EngineCore pid=243) INFO 04-22 01:48:45 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.13 s -(EngineCore pid=243) DEBUG 04-22 01:48:45 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/0b188b6917/rank_0_0/backbone/computation_graph.py -(APIServer pid=1) DEBUG 04-22 01:48:46 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) INFO 04-22 01:48:46 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/eb43777f3ad942d09a4a824317e86653cb53f8c77161fefd5eaae5f110b38a0a/rank_0_0/model -(EngineCore pid=243) INFO 04-22 01:48:46 [compilation/monitor.py:48] torch.compile took 11.04 s in total -(EngineCore pid=243) INFO 04-22 01:48:47 [compilation/monitor.py:76] Initial profiling/warmup run took 0.45 s -(EngineCore pid=243) INFO 04-22 01:48:52 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=243) DEBUG 04-22 01:48:52 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=243) INFO 04-22 01:48:52 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=243) DEBUG 04-22 01:48:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:48:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:48:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:48:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 138.00 MiB first-capture + (51-1) × 6.00 MiB per-graph 
-(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:48:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:48:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 260.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=243) INFO 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total -(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.24 GiB (total), 58.79 GiB (within requested) -(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.98 seconds. Total non KV cache memory: 17.12GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 14.99GiB. 
-(EngineCore pid=243) INFO 04-22 01:48:53 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.11 GiB -(EngineCore pid=243) INFO 04-22 01:48:53 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. -(EngineCore pid=243) INFO 04-22 01:48:53 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 476,016 tokens -(EngineCore pid=243) INFO 04-22 01:48:53 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 32,768 tokens per request: 14.53x -(EngineCore pid=243) 2026-04-22 01:48:53,875 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=243) DEBUG 04-22 01:48:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) 2026-04-22 01:48:53,884 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:29:59 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:29:59 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:29:59 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:29:59 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:29:59 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:29:59 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-4-Scout-17B-16E-Instruct -(APIServer pid=1) INFO 04-22 01:29:59 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:29:59 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:29:59 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-4-Scout-17B-16E-Instruct', 'model': 'meta-llama/Llama-4-Scout-17B-16E-Instruct', 'max_model_len': 8192, 'tensor_parallel_size': 4, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:29:59 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:30:00 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.mllama4.Llama4ForConditionalGeneration from cache -(APIServer pid=1) DEBUG 04-22 01:30:00 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0008244 secs -(APIServer pid=1) INFO 04-22 01:30:00 [config/model.py:549] Resolved architecture: Llama4ForConditionalGeneration -(APIServer pid=1) INFO 04-22 01:30:00 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 01:30:00 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:30:00 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:30:00 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:30:00 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 01:30:00 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:30:00 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 01:30:00 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:30:00 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) INFO 04-22 01:30:01 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 01:30:01 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:30:01 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:30:02 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:30:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(APIServer pid=1) DEBUG 04-22 01:30:02 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(APIServer pid=1) DEBUG 04-22 01:30:03 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. 
-(APIServer pid=1) DEBUG 04-22 01:30:03 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:30:13 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:30:13 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:30:13 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:30:13 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:30:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:30:13 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:30:13 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:30:13 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:30:13 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:30:13 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:30:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:30:13 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:30:17 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=470) DEBUG 04-22 01:30:19 [v1/engine/core.py:1018] Waiting for init message from front-end. 
-(EngineCore pid=470) DEBUG 04-22 01:30:19 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/d183e7bb-6eef-4da4-943b-f32769998c39'], outputs=['ipc:///tmp/1364cfc6-7766-4264-bfe9-0dd5982225b8'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(APIServer pid=1) DEBUG 04-22 01:30:19 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=470) DEBUG 04-22 01:30:19 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=470) DEBUG 04-22 01:30:19 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=470) DEBUG 04-22 01:30:19 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=470) DEBUG 04-22 01:30:19 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=470) DEBUG 04-22 01:30:19 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=470) INFO 04-22 01:30:19 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-4-Scout-17B-16E-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-4-Scout-17B-16E-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-4-Scout-17B-16E-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [204, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=470) WARNING 04-22 01:30:19 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=470) INFO 04-22 01:30:19 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.7.87 (local), world_size=4, local_world_size=4 -(EngineCore pid=470) DEBUG 04-22 01:30:19 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/e7240d42-f49f-4d17-9d01-cc24e0453b78 -(EngineCore pid=470) DEBUG 04-22 01:30:19 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_2b2306a0'), local_subscribe_addr='ipc:///tmp/e7240d42-f49f-4d17-9d01-cc24e0453b78', local_notify_addr='ipc:///tmp/36af057c-4fa5-42de-9705-f0cd0cc565fb', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 01:30:22 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:30:22 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:30:22 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:30:22 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:30:22 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:30:22 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:30:22 [platforms/__init__.py:134] Checking if XPU platform is available. 
-DEBUG 04-22 01:30:22 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:30:22 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:30:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:30:22 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:30:22 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:30:22 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:30:22 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:30:22 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
-DEBUG 04-22 01:30:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:30:22 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:30:22 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:30:22 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:30:22 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:30:27 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:30:27 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:30:27 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-22 01:30:27 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:30:28 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:30:28 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:30:28 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:30:28 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 01:30:29 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:30:29 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:30:29 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:30:29 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 01:30:29 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:30:29 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:30:29 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:30:29 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-DEBUG 04-22 01:30:29 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:30:29 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:30:29 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:30:29 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) DEBUG 04-22 01:30:29 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -DEBUG 04-22 01:30:29 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -DEBUG 04-22 01:30:29 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -DEBUG 04-22 01:30:29 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -DEBUG 04-22 01:30:29 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -DEBUG 04-22 01:30:29 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -DEBUG 04-22 01:30:29 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -DEBUG 04-22 01:30:29 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -DEBUG 04-22 01:30:29 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -DEBUG 04-22 01:30:29 [compilation/decorators.py:213] Inferred dynamic dimensions for forward 
method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -DEBUG 04-22 01:30:29 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -DEBUG 04-22 01:30:29 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -DEBUG 04-22 01:30:29 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(Worker pid=669) DEBUG 04-22 01:30:30 [distributed/parallel_state.py:1356] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:38185 backend=nccl -(Worker pid=669) INFO 04-22 01:30:30 [distributed/parallel_state.py:1400] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:38185 backend=nccl -(Worker pid=670) DEBUG 04-22 01:30:31 [distributed/parallel_state.py:1356] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:38185 backend=nccl -(Worker pid=670) INFO 04-22 01:30:31 [distributed/parallel_state.py:1400] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:38185 backend=nccl -(Worker pid=671) DEBUG 04-22 01:30:31 [distributed/parallel_state.py:1356] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:38185 backend=nccl -(Worker pid=671) INFO 04-22 01:30:31 [distributed/parallel_state.py:1400] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:38185 backend=nccl -(Worker pid=672) DEBUG 04-22 01:30:31 [distributed/parallel_state.py:1356] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:38185 backend=nccl -(Worker pid=672) INFO 04-22 01:30:31 [distributed/parallel_state.py:1400] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:38185 backend=nccl -[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 2 is connected to 3 peer ranks. 
Expected number of connected peer ranks is : 3 -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=670) DEBUG 04-22 01:30:31 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=671) DEBUG 04-22 01:30:31 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=672) DEBUG 04-22 01:30:31 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=669) DEBUG 04-22 01:30:31 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=670) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=671) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=670) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=671) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=672) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
-(Worker pid=672) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=669) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=669) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=669) DEBUG 04-22 01:30:31 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=669) INFO 04-22 01:30:31 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=670) DEBUG 04-22 01:30:32 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=671) DEBUG 04-22 01:30:32 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=672) DEBUG 04-22 01:30:32 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=669) DEBUG 04-22 01:30:32 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=669) DEBUG 04-22 01:30:32 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/2470ef01-879c-4c68-8a4f-d117d276b4de -(Worker pid=669) DEBUG 04-22 01:30:32 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3], buffer_handle=(3, 4194304, 6, 'psm_62c7f776'), local_subscribe_addr='ipc:///tmp/2470ef01-879c-4c68-8a4f-d117d276b4de', local_notify_addr='ipc:///tmp/7101c424-5035-49eb-b6bb-ea7d4a2f6f23', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=671) DEBUG 04-22 01:30:32 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2470ef01-879c-4c68-8a4f-d117d276b4de -(Worker pid=672) DEBUG 04-22 01:30:32 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2470ef01-879c-4c68-8a4f-d117d276b4de -(Worker pid=670) DEBUG 04-22 01:30:32 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2470ef01-879c-4c68-8a4f-d117d276b4de -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=669) INFO 04-22 01:30:33 [distributed/parallel_state.py:1716] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. 
-(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 605, in __init__ -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] self.worker.init_device() -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 312, in init_device -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] self.worker.init_device() # type: ignore -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ 
-(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 283, in init_device -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] self.requested_memory = request_memory(init_snapshot, self.cache_config) -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/utils.py", line 413, in request_memory -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] raise ValueError( -(Worker pid=670) ERROR 04-22 01:30:33 [v1/executor/multiproc_executor.py:857] ValueError: Free memory on device cuda:1 (60.33/79.19 GiB) on startup is less than desired GPU memory utilization (0.95, 75.23 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes. 
-(EngineCore pid=470) DEBUG 04-22 01:30:33 [v1/executor/multiproc_executor.py:419] Worker Termination: allow workers to gracefully shutdown -(Worker pid=672) DEBUG 04-22 01:30:33 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.12GiB, total_memory=79.19GiB, cuda_memory=2.07GiB, torch_memory=0.02GiB, non_torch_memory=2.05GiB, timestamp=1776821433.6940672, auto_measure=True -(Worker pid=672) DEBUG 04-22 01:30:33 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=669) DEBUG 04-22 01:30:33 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.12GiB, total_memory=79.19GiB, cuda_memory=2.07GiB, torch_memory=0.02GiB, non_torch_memory=2.05GiB, timestamp=1776821433.7040389, auto_measure=True -(Worker pid=669) DEBUG 04-22 01:30:33 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=671) DEBUG 04-22 01:30:33 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.12GiB, total_memory=79.19GiB, cuda_memory=2.07GiB, torch_memory=0.02GiB, non_torch_memory=2.05GiB, timestamp=1776821433.7163646, auto_measure=True -(Worker pid=671) DEBUG 04-22 01:30:33 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=672) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=672) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=669) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=669) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 
'positions', 'hidden_states', 'input_embeds'] -(Worker pid=671) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=671) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=672) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=669) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=672) DEBUG 04-22 01:30:33 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=669) DEBUG 04-22 01:30:33 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=671) DEBUG 04-22 01:30:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=669) DEBUG 04-22 01:30:33 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=671) DEBUG 04-22 01:30:33 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). 
-(EngineCore pid=470) DEBUG 04-22 01:30:37 [v1/executor/multiproc_executor.py:424] Worker Termination: workers still running sending SIGTERM -(Worker pid=672) DEBUG 04-22 01:30:37 [v1/executor/multiproc_executor.py:794] WorkerProc handling signal 15, raising SystemExit -(Worker pid=672) Exception ignored in: TypeError("'str' object cannot be converted to 'AddedToken'") -(Worker pid=672) Traceback (most recent call last): -(Worker pid=672) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 797, in signal_handler -(Worker pid=672) raise SystemExit() -(Worker pid=672) SystemExit: -(Worker pid=669) DEBUG 04-22 01:30:37 [v1/executor/multiproc_executor.py:794] WorkerProc handling signal 15, raising SystemExit -(Worker pid=669) WARNING 04-22 01:30:37 [v1/executor/multiproc_executor.py:871] WorkerProc was terminated -(Worker pid=671) DEBUG 04-22 01:30:37 [v1/executor/multiproc_executor.py:794] WorkerProc handling signal 15, raising SystemExit -(Worker pid=671) WARNING 04-22 01:30:37 [v1/executor/multiproc_executor.py:871] WorkerProc was terminated -[rank0]:[W422 01:30:38.196588814 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -(APIServer pid=1) DEBUG 04-22 01:30:39 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -[rank3]:[W422 01:30:39.076626420 TCPStore.cpp:125] [c10d] recvValue failed on SocketImpl(fd=71, addr=[localhost]:58688, remote=[localhost]:38185): Failed to recv, got 0 bytes. Connection was likely closed. Did the remote server shutdown or crash? 
-Exception raised from recvBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:682 (most recent call first): -frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x9d (0x7efc950ddfdd in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) -frame #1: + 0x6a3325d (0x7efbaf67d25d in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) -frame #2: c10d::TCPStore::check(std::vector, std::allocator >, std::allocator, std::allocator > > > const&) + 0x273 (0x7efbaf67b1f3 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) -frame #3: c10d::ProcessGroupNCCL::HeartbeatMonitor::runLoop() + 0x44c (0x7efb50df79cc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) -frame #4: + 0xdc253 (0x7efc564b0253 in /lib/x86_64-linux-gnu/libstdc++.so.6) -frame #5: + 0x94ac3 (0x7efc95e39ac3 in /lib/x86_64-linux-gnu/libc.so.6) -frame #6: clone + 0x44 (0x7efc95ecaa84 in /lib/x86_64-linux-gnu/libc.so.6) - -[rank3]:[W422 01:30:39.079729534 ProcessGroupNCCL.cpp:1802] [PG ID 0 PG GUID 0 Rank 3] Failed to check the "should dump" flag on TCPStore, (maybe TCPStore server has shut down too early), with error: Failed to recv, got 0 bytes. Connection was likely closed. Did the remote server shutdown or crash? 
-[rank3]:[W422 01:30:40.079908370 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=71, addr=[localhost]:58688, remote=[localhost]:38185): Broken pipe -Exception raised from sendBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): -frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x9d (0x7efc950ddfdd in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) -frame #1: + 0x6a326d1 (0x7efbaf67c6d1 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) -frame #2: c10d::TCPStore::check(std::vector, std::allocator >, std::allocator, std::allocator > > > const&) + 0x24d (0x7efbaf67b1cd in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) -frame #3: c10d::ProcessGroupNCCL::HeartbeatMonitor::runLoop() + 0x44c (0x7efb50df79cc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) -frame #4: + 0xdc253 (0x7efc564b0253 in /lib/x86_64-linux-gnu/libstdc++.so.6) -frame #5: + 0x94ac3 (0x7efc95e39ac3 in /lib/x86_64-linux-gnu/libc.so.6) -frame #6: clone + 0x44 (0x7efc95ecaa84 in /lib/x86_64-linux-gnu/libc.so.6) - -[rank3]:[W422 01:30:40.084561021 ProcessGroupNCCL.cpp:1802] [PG ID 0 PG GUID 0 Rank 3] Failed to check the "should dump" flag on TCPStore, (maybe TCPStore server has shut down too early), with error: Broken pipe -(Worker_TP3 pid=672) DEBUG 04-22 01:30:40 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP3 pid=672) DEBUG 04-22 01:30:40 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP3 pid=672) DEBUG 04-22 01:30:40 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) -(Worker_TP3 pid=672) DEBUG 04-22 01:30:40 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 
'logits_processor': 1}) -(Worker_TP3 pid=672) DEBUG 04-22 01:30:40 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) -(Worker_TP3 pid=672) DEBUG 04-22 01:30:40 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP3 pid=672) DEBUG 04-22 01:30:40 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP3 pid=672) DEBUG 04-22 01:30:41 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00050-of-00050.safetensors', 'model-00031-of-00050.safetensors', 'model-00029-of-00050.safetensors', 'model-00011-of-00050.safetensors', 'model-00048-of-00050.safetensors', 'model-00038-of-00050.safetensors', 'model-00010-of-00050.safetensors', 'model-00015-of-00050.safetensors', 'model-00045-of-00050.safetensors', 'model-00009-of-00050.safetensors', 'model-00016-of-00050.safetensors', 'model-00013-of-00050.safetensors', 'model-00025-of-00050.safetensors', 'model-00026-of-00050.safetensors', 'model-00018-of-00050.safetensors', 'model-00008-of-00050.safetensors', 'model-00037-of-00050.safetensors', 'model-00034-of-00050.safetensors', 'model-00005-of-00050.safetensors', 'model-00002-of-00050.safetensors', 'model-00006-of-00050.safetensors', 'model-00035-of-00050.safetensors', 'model-00003-of-00050.safetensors', 'model-00040-of-00050.safetensors', 'model-00012-of-00050.safetensors', 'model-00004-of-00050.safetensors', 'model-00020-of-00050.safetensors', 'model-00022-of-00050.safetensors', 'model-00047-of-00050.safetensors', 'model-00030-of-00050.safetensors', 'model-00039-of-00050.safetensors', 'model-00028-of-00050.safetensors', 'model-00032-of-00050.safetensors', 'model-00023-of-00050.safetensors', 'model-00041-of-00050.safetensors', 'model-00044-of-00050.safetensors', 
'model-00017-of-00050.safetensors', 'model-00043-of-00050.safetensors', 'model-00049-of-00050.safetensors', 'model-00024-of-00050.safetensors', 'model-00046-of-00050.safetensors', 'model-00033-of-00050.safetensors', 'model-00019-of-00050.safetensors', 'model-00021-of-00050.safetensors', 'model-00007-of-00050.safetensors', 'model-00027-of-00050.safetensors', 'model-00036-of-00050.safetensors', 'model-00001-of-00050.safetensors', 'model-00042-of-00050.safetensors', 'model-00014-of-00050.safetensors']] -[rank3]:[W422 01:30:41.084720813 TCPStore.cpp:106] [c10d] sendBytes failed on SocketImpl(fd=71, addr=[localhost]:58688, remote=[localhost]:38185): Broken pipe -Exception raised from sendBytes at /pytorch/torch/csrc/distributed/c10d/Utils.hpp:653 (most recent call first): -frame #0: c10::Error::Error(c10::SourceLocation, std::__cxx11::basic_string, std::allocator >) + 0x9d (0x7efc950ddfdd in /usr/local/lib/python3.12/dist-packages/torch/lib/libc10.so) -frame #1: + 0x6a326d1 (0x7efbaf67c6d1 in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) -frame #2: c10d::TCPStore::check(std::vector, std::allocator >, std::allocator, std::allocator > > > const&) + 0x24d (0x7efbaf67b1cd in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cpu.so) -frame #3: c10d::ProcessGroupNCCL::HeartbeatMonitor::runLoop() + 0x44c (0x7efb50df79cc in /usr/local/lib/python3.12/dist-packages/torch/lib/libtorch_cuda.so) -frame #4: + 0xdc253 (0x7efc564b0253 in /lib/x86_64-linux-gnu/libstdc++.so.6) -frame #5: + 0x94ac3 (0x7efc95e39ac3 in /lib/x86_64-linux-gnu/libc.so.6) -frame #6: clone + 0x44 (0x7efc95ecaa84 in /lib/x86_64-linux-gnu/libc.so.6) - -[rank3]:[W422 01:30:41.087290159 ProcessGroupNCCL.cpp:1802] [PG ID 0 PG GUID 0 Rank 3] Failed to check the "should dump" flag on TCPStore, (maybe TCPStore server has shut down too early), with error: Broken pipe -(EngineCore pid=470) DEBUG 04-22 01:30:41 [v1/executor/multiproc_executor.py:429] Worker Termination: resorting to SIGKILL to 
take down workers -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] EngineCore failed to start. -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] Traceback (most recent call last): -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] return func(*args, **kwargs) -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__ -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] super().__init__( -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 114, in __init__ -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] self.model_executor = executor_class(vllm_config) -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 101, in __init__ -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] super().__init__(vllm_config) -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File 
"/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] return func(*args, **kwargs) -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 103, in __init__ -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] self._init_executor() -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 190, in _init_executor -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] self.workers = WorkerProc.wait_for_ready(unready_workers) -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 736, in wait_for_ready -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] raise e from None -(EngineCore pid=470) ERROR 04-22 01:30:41 [v1/engine/core.py:1108] Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause. 
-(EngineCore pid=470) Process EngineCore: -(EngineCore pid=470) Traceback (most recent call last): -(EngineCore pid=470) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap -(EngineCore pid=470) self.run() -(EngineCore pid=470) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run -(EngineCore pid=470) self._target(*self._args, **self._kwargs) -(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1112, in run_engine_core -(EngineCore pid=470) raise e -(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core -(EngineCore pid=470) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) -(EngineCore pid=470) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(EngineCore pid=470) return func(*args, **kwargs) -(EngineCore pid=470) ^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__ -(EngineCore pid=470) super().__init__( -(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 114, in __init__ -(EngineCore pid=470) self.model_executor = executor_class(vllm_config) -(EngineCore pid=470) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 101, in __init__ -(EngineCore pid=470) super().__init__(vllm_config) -(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(EngineCore pid=470) return func(*args, **kwargs) -(EngineCore pid=470) ^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 103, in __init__ -(EngineCore pid=470) self._init_executor() 
-(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 190, in _init_executor -(EngineCore pid=470) self.workers = WorkerProc.wait_for_ready(unready_workers) -(EngineCore pid=470) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=470) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 736, in wait_for_ready -(EngineCore pid=470) raise e from None -(EngineCore pid=470) Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause. -(EngineCore pid=470) DEBUG 04-22 01:30:42 [v1/executor/multiproc_executor.py:438] Triggering shutdown of workers -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in -(APIServer pid=1) sys.exit(main()) -(APIServer pid=1) ^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main -(APIServer pid=1) args.dispatch_function(args) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd -(APIServer pid=1) uvloop.run(run_server(args)) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run -(APIServer pid=1) return __asyncio.run( -(APIServer pid=1) ^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run -(APIServer pid=1) return runner.run(main) -(APIServer pid=1) ^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run -(APIServer pid=1) return self._loop.run_until_complete(task) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper -(APIServer pid=1) return await main 
-(APIServer pid=1) ^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server -(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker -(APIServer pid=1) async with build_async_engine_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client -(APIServer pid=1) async with build_async_engine_client_from_engine_args( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args -(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config -(APIServer pid=1) return cls( -(APIServer pid=1) ^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 154, in __init__ -(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(APIServer pid=1) return func(*args, **kwargs) -(APIServer pid=1) 
^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client -(APIServer pid=1) return AsyncMPClient(*client_args) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(APIServer pid=1) return func(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 887, in __init__ -(APIServer pid=1) super().__init__( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 535, in __init__ -(APIServer pid=1) with launch_core_engines( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ -(APIServer pid=1) next(self.gen) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines -(APIServer pid=1) wait_for_engine_startup( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup -(APIServer pid=1) raise RuntimeError( -(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} -/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 1 leaked semaphore objects to clean up at shutdown - warnings.warn('resource_tracker: There appear to be %d ' -/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown - warnings.warn('resource_tracker: There appear to be %d ' diff --git a/accuracy/results/v0.19.0/logs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.log deleted file mode 100644 index bf98beb9..00000000 --- a/accuracy/results/v0.19.0/logs/meta-llama-llama-4-scout--h100-80gb--tp4pp1dp1--8192.log +++ /dev/null @@ -1,2342 +0,0 @@ -DEBUG 04-23 00:11:13 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-23 00:11:13 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-23 00:11:13 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-23 00:11:13 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-23 00:11:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-23 00:11:13 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-23 00:11:13 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-23 00:11:13 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-23 00:11:13 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-23 00:11:13 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-23 00:11:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-23 00:11:13 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-23 00:11:18 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-23 00:11:20 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-23 00:11:20 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-23 00:11:20 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-23 00:11:20 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-23 00:11:20 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-23 00:11:20 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-23 00:11:20 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-23 00:11:20 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-23 00:11:20 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model meta-llama/Llama-4-Scout-17B-16E-Instruct -(APIServer pid=1) INFO 04-23 00:11:20 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-23 00:11:20 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-23 00:11:20 [entrypoints/utils.py:233] non-default args: {'model_tag': 'meta-llama/Llama-4-Scout-17B-16E-Instruct', 'model': 'meta-llama/Llama-4-Scout-17B-16E-Instruct', 'max_model_len': 8192, 'tensor_parallel_size': 4, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-23 00:11:20 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-23 00:11:20 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.mllama4.Llama4ForConditionalGeneration from cache -(APIServer pid=1) DEBUG 04-23 00:11:20 
[logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0015249 secs -(APIServer pid=1) INFO 04-23 00:11:20 [config/model.py:549] Resolved architecture: Llama4ForConditionalGeneration -(APIServer pid=1) INFO 04-23 00:11:20 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-23 00:11:20 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-23 00:11:20 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-23 00:11:20 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-23 00:11:20 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-23 00:11:20 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-23 00:11:20 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-23 00:11:20 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-23 00:11:20 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) INFO 04-23 00:11:22 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-23 00:11:22 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. 
-(APIServer pid=1) DEBUG 04-23 00:11:22 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-23 00:11:23 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-23 00:11:23 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(APIServer pid=1) DEBUG 04-23 00:11:23 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(APIServer pid=1) DEBUG 04-23 00:11:24 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. -(APIServer pid=1) DEBUG 04-23 00:11:24 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-23 00:11:34 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-23 00:11:34 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-23 00:11:34 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-23 00:11:34 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-23 00:11:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-23 00:11:34 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-23 00:11:34 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-23 00:11:34 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-23 00:11:34 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-23 00:11:34 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-23 00:11:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-23 00:11:34 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-23 00:11:38 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=469) DEBUG 04-23 00:11:40 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-23 00:11:40 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=469) DEBUG 04-23 00:11:40 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/de95e44d-b5c4-4856-85f7-6227a379d9a2'], outputs=['ipc:///tmp/a92eb3b7-4c3e-4857-8820-5bf77663f228'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=469) DEBUG 04-23 00:11:40 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=469) DEBUG 04-23 00:11:40 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=469) DEBUG 04-23 00:11:40 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=469) DEBUG 04-23 00:11:40 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=469) DEBUG 04-23 00:11:40 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=469) INFO 04-23 00:11:40 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='meta-llama/Llama-4-Scout-17B-16E-Instruct', speculative_config=None, tokenizer='meta-llama/Llama-4-Scout-17B-16E-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=meta-llama/Llama-4-Scout-17B-16E-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [204, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=469) WARNING 04-23 00:11:40 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=469) INFO 04-23 00:11:40 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.128.11.101 (local), world_size=4, local_world_size=4 -(EngineCore pid=469) DEBUG 04-23 00:11:40 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/fe9552c7-f71c-49a7-a745-1a0d33ad6db5 -(EngineCore pid=469) DEBUG 04-23 00:11:40 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_4a7b0a34'), local_subscribe_addr='ipc:///tmp/fe9552c7-f71c-49a7-a745-1a0d33ad6db5', local_notify_addr='ipc:///tmp/e2411317-cee2-4e4e-8b72-6f10ce156f0f', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-23 00:11:43 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-23 00:11:43 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-23 00:11:43 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-23 00:11:43 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-23 00:11:43 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-23 00:11:43 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:62] Checking if CUDA platform is available. 
-DEBUG 04-23 00:11:43 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-23 00:11:43 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-23 00:11:43 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-23 00:11:43 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-23 00:11:43 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-23 00:11:43 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-23 00:11:43 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-23 00:11:43 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
-DEBUG 04-23 00:11:43 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-23 00:11:43 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-23 00:11:43 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-23 00:11:43 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-23 00:11:43 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-23 00:11:43 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-23 00:11:44 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-23 00:11:44 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-23 00:11:44 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-23 00:11:44 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-23 00:11:44 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-23 00:11:44 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-23 00:11:44 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-23 00:11:44 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-23 00:11:48 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-23 00:11:48 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-23 00:11:48 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-23 00:11:49 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-23 00:11:50 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-23 00:11:50 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-23 00:11:50 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-23 00:11:50 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-23 00:11:50 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-23 00:11:50 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-23 00:11:50 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-23 00:11:50 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-23 00:11:50 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-23 00:11:50 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-23 00:11:50 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-23 00:11:50 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) DEBUG 04-23 00:11:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-DEBUG 04-23 00:11:50 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -DEBUG 04-23 00:11:50 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -DEBUG 04-23 00:11:50 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -DEBUG 04-23 00:11:50 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -DEBUG 04-23 00:11:50 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-23 00:11:50 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-23 00:11:50 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-23 00:11:50 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-DEBUG 04-23 00:11:50 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -DEBUG 04-23 00:11:50 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -DEBUG 04-23 00:11:50 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -DEBUG 04-23 00:11:50 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -DEBUG 04-23 00:11:50 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -DEBUG 04-23 00:11:50 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -DEBUG 04-23 00:11:50 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -DEBUG 04-23 00:11:51 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(Worker pid=668) DEBUG 04-23 00:11:52 [distributed/parallel_state.py:1356] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:39093 backend=nccl -(Worker pid=668) INFO 04-23 00:11:52 [distributed/parallel_state.py:1400] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:39093 backend=nccl -(Worker pid=670) DEBUG 04-23 00:11:52 [distributed/parallel_state.py:1356] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:39093 backend=nccl -(Worker pid=670) INFO 04-23 00:11:52 [distributed/parallel_state.py:1400] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:39093 backend=nccl -(Worker pid=671) DEBUG 04-23 00:11:52 [distributed/parallel_state.py:1356] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:39093 backend=nccl -(Worker pid=671) INFO 04-23 00:11:52 
[distributed/parallel_state.py:1400] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:39093 backend=nccl -(Worker pid=669) DEBUG 04-23 00:11:52 [distributed/parallel_state.py:1356] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:39093 backend=nccl -(Worker pid=669) INFO 04-23 00:11:52 [distributed/parallel_state.py:1400] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:39093 backend=nccl -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=669) DEBUG 04-23 00:11:52 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=668) DEBUG 04-23 00:11:52 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=670) DEBUG 04-23 00:11:52 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=671) DEBUG 04-23 00:11:52 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=668) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
-(Worker pid=668) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=669) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=669) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=668) DEBUG 04-23 00:11:52 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=668) INFO 04-23 00:11:52 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=670) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=670) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=671) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=671) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=670) DEBUG 04-23 00:11:54 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=671) DEBUG 04-23 00:11:54 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=669) DEBUG 04-23 00:11:54 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=668) DEBUG 04-23 00:11:54 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=668) DEBUG 04-23 00:11:54 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/03183ada-28c6-4fcd-ad5b-2f4670d315d5 -(Worker pid=668) DEBUG 04-23 00:11:54 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3], buffer_handle=(3, 4194304, 6, 'psm_a0da278d'), local_subscribe_addr='ipc:///tmp/03183ada-28c6-4fcd-ad5b-2f4670d315d5', local_notify_addr='ipc:///tmp/7066347a-f7cf-46f0-98ac-1b6df273b035', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=670) DEBUG 04-23 00:11:54 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/03183ada-28c6-4fcd-ad5b-2f4670d315d5 -(Worker pid=669) DEBUG 04-23 00:11:54 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/03183ada-28c6-4fcd-ad5b-2f4670d315d5 -(Worker pid=671) DEBUG 04-23 00:11:54 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/03183ada-28c6-4fcd-ad5b-2f4670d315d5 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. 
Expected number of connected peer ranks is : 3 -(Worker pid=668) INFO 04-23 00:11:54 [distributed/parallel_state.py:1716] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A -(Worker pid=669) DEBUG 04-23 00:11:55 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.12GiB, total_memory=79.19GiB, cuda_memory=2.07GiB, torch_memory=0.02GiB, non_torch_memory=2.05GiB, timestamp=1776903115.0043905, auto_measure=True -(Worker pid=669) DEBUG 04-23 00:11:55 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=670) DEBUG 04-23 00:11:55 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.12GiB, total_memory=79.19GiB, cuda_memory=2.07GiB, torch_memory=0.02GiB, non_torch_memory=2.05GiB, timestamp=1776903115.0138264, auto_measure=True -(Worker pid=670) DEBUG 04-23 00:11:55 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=668) DEBUG 04-23 00:11:55 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.12GiB, total_memory=79.19GiB, cuda_memory=2.07GiB, torch_memory=0.02GiB, non_torch_memory=2.05GiB, timestamp=1776903115.0193129, auto_measure=True -(Worker pid=668) DEBUG 04-23 00:11:55 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=671) DEBUG 04-23 00:11:55 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.12GiB, total_memory=79.19GiB, cuda_memory=2.07GiB, torch_memory=0.02GiB, non_torch_memory=2.05GiB, timestamp=1776903115.032611, auto_measure=True -(Worker pid=671) DEBUG 04-23 00:11:55 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=669) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=669) DEBUG 04-23 00:11:55 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=670) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=670) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=668) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=668) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=671) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=671) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=669) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=670) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=668) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=668) DEBUG 04-23 00:11:55 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. 
Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=669) DEBUG 04-23 00:11:55 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=670) DEBUG 04-23 00:11:55 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=671) DEBUG 04-23 00:11:55 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=668) DEBUG 04-23 00:11:55 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=671) DEBUG 04-23 00:11:55 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(APIServer pid=1) DEBUG 04-23 00:12:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker pid=668) DEBUG 04-23 00:12:01 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_TP0 pid=668) INFO 04-23 00:12:01 [v1/worker/gpu_model_runner.py:4735] Starting to load model meta-llama/Llama-4-Scout-17B-16E-Instruct... -(Worker_TP0 pid=668) INFO 04-23 00:12:01 [platforms/cuda.py:390] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention -(Worker_TP0 pid=668) INFO 04-23 00:12:01 [model_executor/.../attention/mm_encoder_attention.py:230] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention. -(Worker_TP0 pid=668) INFO 04-23 00:12:01 [config/vllm.py:790] Asynchronous scheduling is enabled. 
-(Worker_TP0 pid=668) INFO 04-23 00:12:01 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP0 pid=668) DEBUG 04-23 00:12:01 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_TP0 pid=668) INFO 04-23 00:12:01 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(Worker_TP0 pid=668) INFO 04-23 00:12:01 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP0 pid=668) INFO 04-23 00:12:01 [model_executor/.../oracle/unquantized.py:186] Using TRITON backend for Unquantized MoE -(Worker_TP0 pid=668) DEBUG 04-23 00:12:01 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts -(Worker_TP0 pid=668) DEBUG 04-23 00:12:02 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=668) DEBUG 04-23 00:12:02 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=668) DEBUG 04-23 00:12:02 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) -(Worker_TP0 pid=668) DEBUG 04-23 00:12:02 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=668) DEBUG 04-23 00:12:02 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) -(Worker_TP0 pid=668) DEBUG 04-23 00:12:02 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 
'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=668) DEBUG 04-23 00:12:02 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP0 pid=668) DEBUG 04-23 00:12:02 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00047-of-00050.safetensors', 'model-00006-of-00050.safetensors', 'model-00002-of-00050.safetensors', 'model-00050-of-00050.safetensors', 'model-00001-of-00050.safetensors', 'model-00013-of-00050.safetensors', 'model-00036-of-00050.safetensors', 'model-00039-of-00050.safetensors', 'model-00015-of-00050.safetensors', 'model-00048-of-00050.safetensors', 'model-00020-of-00050.safetensors', 'model-00042-of-00050.safetensors', 'model-00004-of-00050.safetensors', 'model-00027-of-00050.safetensors', 'model-00041-of-00050.safetensors', 'model-00011-of-00050.safetensors', 'model-00032-of-00050.safetensors', 'model-00016-of-00050.safetensors', 'model-00018-of-00050.safetensors', 'model-00049-of-00050.safetensors', 'model-00028-of-00050.safetensors', 'model-00031-of-00050.safetensors', 'model-00033-of-00050.safetensors', 'model-00035-of-00050.safetensors', 'model-00017-of-00050.safetensors', 'model-00037-of-00050.safetensors', 'model-00045-of-00050.safetensors', 'model-00021-of-00050.safetensors', 'model-00026-of-00050.safetensors', 'model-00034-of-00050.safetensors', 'model-00008-of-00050.safetensors', 'model-00019-of-00050.safetensors', 'model-00029-of-00050.safetensors', 'model-00014-of-00050.safetensors', 'model-00040-of-00050.safetensors', 'model-00030-of-00050.safetensors', 'model-00024-of-00050.safetensors', 'model-00007-of-00050.safetensors', 'model-00025-of-00050.safetensors', 'model-00038-of-00050.safetensors', 'model-00043-of-00050.safetensors', 'model-00046-of-00050.safetensors', 'model-00005-of-00050.safetensors', 'model-00022-of-00050.safetensors', 
'model-00044-of-00050.safetensors', 'model-00009-of-00050.safetensors', 'model-00012-of-00050.safetensors', 'model-00010-of-00050.safetensors', 'model-00023-of-00050.safetensors', 'model-00003-of-00050.safetensors']] -(Worker_TP2 pid=670) DEBUG 04-23 00:12:02 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP2 pid=670) DEBUG 04-23 00:12:02 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP2 pid=670) DEBUG 04-23 00:12:02 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) -(Worker_TP2 pid=670) DEBUG 04-23 00:12:02 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP2 pid=670) DEBUG 04-23 00:12:02 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) -(Worker_TP2 pid=670) DEBUG 04-23 00:12:02 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP2 pid=670) DEBUG 04-23 00:12:02 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP3 pid=671) DEBUG 04-23 00:12:02 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP3 pid=671) DEBUG 04-23 00:12:02 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP1 pid=669) DEBUG 04-23 00:12:02 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP1 pid=669) DEBUG 04-23 00:12:02 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP3 pid=671) DEBUG 04-23 00:12:02 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) -(Worker_TP3 pid=671) DEBUG 04-23 00:12:02 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=669) DEBUG 04-23 00:12:02 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) -(Worker_TP1 pid=669) DEBUG 04-23 00:12:02 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP3 pid=671) DEBUG 04-23 00:12:02 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) -(Worker_TP3 pid=671) DEBUG 04-23 00:12:02 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP3 pid=671) DEBUG 04-23 00:12:02 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP1 pid=669) DEBUG 04-23 00:12:02 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 34}) -(Worker_TP1 pid=669) DEBUG 04-23 00:12:02 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 133, 'silu_and_mul': 48, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'rotary_embedding': 2, 'apply_rotary_emb': 2, 'vocab_parallel_embedding': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=669) DEBUG 04-23 00:12:02 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP2 pid=670) DEBUG 04-23 00:12:02 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00045-of-00050.safetensors', 'model-00034-of-00050.safetensors', 'model-00043-of-00050.safetensors', 'model-00046-of-00050.safetensors', 'model-00008-of-00050.safetensors', 'model-00032-of-00050.safetensors', 'model-00040-of-00050.safetensors', 'model-00042-of-00050.safetensors', 'model-00022-of-00050.safetensors', 'model-00035-of-00050.safetensors', 'model-00037-of-00050.safetensors', 'model-00050-of-00050.safetensors', 'model-00001-of-00050.safetensors', 'model-00033-of-00050.safetensors', 'model-00007-of-00050.safetensors', 'model-00017-of-00050.safetensors', 'model-00024-of-00050.safetensors', 'model-00027-of-00050.safetensors', 'model-00039-of-00050.safetensors', 'model-00015-of-00050.safetensors', 'model-00002-of-00050.safetensors', 'model-00004-of-00050.safetensors', 'model-00025-of-00050.safetensors', 'model-00048-of-00050.safetensors', 'model-00023-of-00050.safetensors', 'model-00010-of-00050.safetensors', 'model-00028-of-00050.safetensors', 'model-00049-of-00050.safetensors', 'model-00018-of-00050.safetensors', 'model-00012-of-00050.safetensors', 'model-00029-of-00050.safetensors', 'model-00016-of-00050.safetensors', 'model-00044-of-00050.safetensors', 'model-00006-of-00050.safetensors', 'model-00038-of-00050.safetensors', 'model-00041-of-00050.safetensors', 
'model-00005-of-00050.safetensors', 'model-00036-of-00050.safetensors', 'model-00026-of-00050.safetensors', 'model-00030-of-00050.safetensors', 'model-00031-of-00050.safetensors', 'model-00003-of-00050.safetensors', 'model-00021-of-00050.safetensors', 'model-00014-of-00050.safetensors', 'model-00019-of-00050.safetensors', 'model-00011-of-00050.safetensors', 'model-00020-of-00050.safetensors', 'model-00009-of-00050.safetensors', 'model-00013-of-00050.safetensors', 'model-00047-of-00050.safetensors']] -(Worker_TP3 pid=671) DEBUG 04-23 00:12:02 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00023-of-00050.safetensors', 'model-00006-of-00050.safetensors', 'model-00010-of-00050.safetensors', 'model-00021-of-00050.safetensors', 'model-00015-of-00050.safetensors', 'model-00001-of-00050.safetensors', 'model-00046-of-00050.safetensors', 'model-00017-of-00050.safetensors', 'model-00003-of-00050.safetensors', 'model-00034-of-00050.safetensors', 'model-00005-of-00050.safetensors', 'model-00048-of-00050.safetensors', 'model-00004-of-00050.safetensors', 'model-00008-of-00050.safetensors', 'model-00014-of-00050.safetensors', 'model-00043-of-00050.safetensors', 'model-00028-of-00050.safetensors', 'model-00012-of-00050.safetensors', 'model-00018-of-00050.safetensors', 'model-00031-of-00050.safetensors', 'model-00042-of-00050.safetensors', 'model-00047-of-00050.safetensors', 'model-00024-of-00050.safetensors', 'model-00030-of-00050.safetensors', 'model-00041-of-00050.safetensors', 'model-00035-of-00050.safetensors', 'model-00026-of-00050.safetensors', 'model-00009-of-00050.safetensors', 'model-00045-of-00050.safetensors', 'model-00007-of-00050.safetensors', 'model-00027-of-00050.safetensors', 'model-00016-of-00050.safetensors', 'model-00029-of-00050.safetensors', 'model-00033-of-00050.safetensors', 'model-00049-of-00050.safetensors', 'model-00020-of-00050.safetensors', 'model-00036-of-00050.safetensors', 'model-00019-of-00050.safetensors', 
'model-00002-of-00050.safetensors', 'model-00039-of-00050.safetensors', 'model-00050-of-00050.safetensors', 'model-00011-of-00050.safetensors', 'model-00025-of-00050.safetensors', 'model-00022-of-00050.safetensors', 'model-00038-of-00050.safetensors', 'model-00013-of-00050.safetensors', 'model-00040-of-00050.safetensors', 'model-00032-of-00050.safetensors', 'model-00044-of-00050.safetensors', 'model-00037-of-00050.safetensors']] -(Worker_TP1 pid=669) DEBUG 04-23 00:12:02 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00050.safetensors', 'model-00021-of-00050.safetensors', 'model-00046-of-00050.safetensors', 'model-00012-of-00050.safetensors', 'model-00004-of-00050.safetensors', 'model-00018-of-00050.safetensors', 'model-00030-of-00050.safetensors', 'model-00037-of-00050.safetensors', 'model-00017-of-00050.safetensors', 'model-00029-of-00050.safetensors', 'model-00032-of-00050.safetensors', 'model-00040-of-00050.safetensors', 'model-00007-of-00050.safetensors', 'model-00002-of-00050.safetensors', 'model-00015-of-00050.safetensors', 'model-00028-of-00050.safetensors', 'model-00026-of-00050.safetensors', 'model-00043-of-00050.safetensors', 'model-00013-of-00050.safetensors', 'model-00006-of-00050.safetensors', 'model-00024-of-00050.safetensors', 'model-00011-of-00050.safetensors', 'model-00020-of-00050.safetensors', 'model-00008-of-00050.safetensors', 'model-00031-of-00050.safetensors', 'model-00010-of-00050.safetensors', 'model-00049-of-00050.safetensors', 'model-00009-of-00050.safetensors', 'model-00022-of-00050.safetensors', 'model-00034-of-00050.safetensors', 'model-00048-of-00050.safetensors', 'model-00019-of-00050.safetensors', 'model-00025-of-00050.safetensors', 'model-00036-of-00050.safetensors', 'model-00041-of-00050.safetensors', 'model-00035-of-00050.safetensors', 'model-00005-of-00050.safetensors', 'model-00039-of-00050.safetensors', 'model-00045-of-00050.safetensors', 'model-00038-of-00050.safetensors', 
'model-00047-of-00050.safetensors', 'model-00023-of-00050.safetensors', 'model-00016-of-00050.safetensors', 'model-00042-of-00050.safetensors', 'model-00044-of-00050.safetensors', 'model-00033-of-00050.safetensors', 'model-00014-of-00050.safetensors', 'model-00050-of-00050.safetensors', 'model-00003-of-00050.safetensors', 'model-00027-of-00050.safetensors']] -(APIServer pid=1) DEBUG 04-23 00:12:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:12:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:12:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:12:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:12:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:13:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:13:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:13:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:13:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:13:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:13:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:14:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:14:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(APIServer pid=1) DEBUG 04-23 00:14:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:14:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:14:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:14:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:15:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:15:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:15:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:15:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:15:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:15:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:16:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:16:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:16:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:16:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:16:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(APIServer pid=1) DEBUG 04-23 00:16:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:17:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:17:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:17:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:17:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:17:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:17:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:18:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:18:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:18:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:18:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:18:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:18:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:19:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:19:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(APIServer pid=1) DEBUG 04-23 00:19:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:19:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:19:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:19:50 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:20:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:20:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 00:20:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=668) INFO 04-23 00:20:22 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for meta-llama/Llama-4-Scout-17B-16E-Instruct: 500.340780 seconds -(Worker_TP0 pid=668) Loading safetensors checkpoint shards: 0% Completed | 0/50 [00:00 -(Worker_TP1 pid=669) DEBUG 04-23 00:27:37 [compilation/decorators.py:528] Start compiling function -(Worker_TP0 pid=668) DEBUG 04-23 00:27:37 [compilation/decorators.py:528] Start compiling function -(Worker_TP2 pid=670) DEBUG 04-23 00:27:37 [compilation/decorators.py:528] Start compiling function -(APIServer pid=1) DEBUG 04-23 00:27:41 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama4.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP3 pid=671) DEBUG 04-23 
00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py -(Worker_TP3 pid=671) 
DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama4.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=3084af9694 comp=e546579c48 code=d8a8fd34540a53acd8b823bdc994eaf99e6b69c42a6dc1372178a61fa194a68c 
dir=/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_1_0/backbone -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=3084af9694 comp=e546579c48 code=d8a8fd34540a53acd8b823bdc994eaf99e6b69c42a6dc1372178a61fa194a68c dir=/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_3_0/backbone -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP1 pid=669) DEBUG 
04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP1 pid=669) DEBUG 04-23 
00:27:44 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP1 
pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP1 pid=669) 
DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': 
False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] Vllm 
config hash: 3084af9694 -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, 
-(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, 
-(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP3 pid=671) DEBUG 04-23 
00:27:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP3 pid=671) 
DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, 
-(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': 
False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] Vllm config hash: 3084af9694 -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama4.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP0 pid=668) INFO 04-23 00:27:44 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone for vLLM's torch.compile -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=3084af9694 comp=e546579c48 code=d8a8fd34540a53acd8b823bdc994eaf99e6b69c42a6dc1372178a61fa194a68c 
dir=/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP0 
pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP0 pid=668) DEBUG 
04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] Vllm config hash: 3084af9694 -(Worker_TP0 pid=668) INFO 04-23 00:27:44 [compilation/backends.py:1111] Dynamo bytecode transform time: 6.65 s -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 2 MB,Maximal number of tokens used by Flashinfer Allreduce 
Fusion: 204 -(Worker_TP0 pid=668) INFO 04-23 00:27:44 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm -(Worker_TP0 pid=668) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. -(Worker_TP0 pid=668) return func(*args, **kwargs) -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama4.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=3084af9694 comp=e546579c48 code=d8a8fd34540a53acd8b823bdc994eaf99e6b69c42a6dc1372178a61fa194a68c dir=/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_2_0/backbone -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 
'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP2 pid=670) DEBUG 04-23 
00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP2 pid=670) 
DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP2 pid=670) DEBUG 
04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': 
True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 
'production-docker-image', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 
[compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP2 
pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [compilation/backends.py:1074] Vllm config hash: 3084af9694 -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=0, max_token_num=204, hidden_dim=5120, dtype=torch.bfloat16 -(Worker_TP1 pid=669) DEBUG 04-23 00:27:44 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=1, max_token_num=204, hidden_dim=5120, dtype=torch.bfloat16 -(Worker_TP2 pid=670) DEBUG 04-23 00:27:44 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=2, max_token_num=204, hidden_dim=5120, dtype=torch.bfloat16 -(Worker_TP3 pid=671) DEBUG 04-23 00:27:44 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=3, max_token_num=204, hidden_dim=5120, dtype=torch.bfloat16 -(Worker_TP0 pid=668) INFO 04-23 00:27:44 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 204), (205, 8192)] -(Worker_TP0 pid=668) DEBUG 04-23 00:27:44 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(Worker_TP3 pid=671) DEBUG 04-23 00:27:45 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices -(Worker_TP3 pid=671) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP3 
pid=671) DEBUG 04-23 00:27:45 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 0 patterns -(Worker_TP3 pid=671) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 1.8 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:27:45 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=671) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:45 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices -(Worker_TP0 pid=668) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:45 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 0 patterns -(Worker_TP0 pid=668) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 1.7 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:45 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=668) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:45 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices -(Worker_TP2 pid=670) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:45 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 0 patterns -(Worker_TP2 pid=670) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] 
AllReduceFusionPass completed in 1.9 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:45 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=670) DEBUG 04-23 00:27:45 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:27:46 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices -(Worker_TP1 pid=669) DEBUG 04-23 00:27:46 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:27:46 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 0 patterns -(Worker_TP1 pid=669) DEBUG 04-23 00:27:46 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 2.0 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:27:46 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:27:46 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=669) DEBUG 04-23 00:27:46 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(APIServer pid=1) DEBUG 04-23 00:27:51 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP0 pid=668) INFO 04-23 00:27:53 [compilation/backends.py:372] Cache the graph of compile range (1, 204) for later use -(Worker_TP0 pid=668) DEBUG 04-23 00:27:53 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 204) from inductor_standalone via handle ('artifact_compile_range_1_204_subgraph_0', '/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_1_204_subgraph_0') -(Worker_TP1 pid=669) DEBUG 04-23 00:27:53 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices -(Worker_TP1 pid=669) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:27:53 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) -(Worker_TP1 pid=669) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:27:53 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=669) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:53 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices -(Worker_TP2 pid=670) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:53 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) -(Worker_TP2 pid=670) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:53 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=670) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 
ms -(Worker_TP3 pid=671) DEBUG 04-23 00:27:53 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices -(Worker_TP3 pid=671) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:27:53 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) -(Worker_TP3 pid=671) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:27:53 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=671) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:53 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices -(Worker_TP0 pid=668) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:53 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) -(Worker_TP0 pid=668) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:53 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=668) DEBUG 04-23 00:27:53 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP0 pid=668) INFO 04-23 00:27:54 [compilation/backends.py:372] Cache the graph of compile range (205, 8192) for later use -(Worker_TP0 pid=668) DEBUG 04-23 00:27:54 [compilation/backends.py:377] Store the 0-th graph for compile range(205, 8192) from inductor_standalone via handle ('artifact_compile_range_205_8192_subgraph_0', 
'/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_205_8192_subgraph_0') -(Worker_TP1 pid=669) DEBUG 04-23 00:27:54 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices -(Worker_TP1 pid=669) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:54 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices -(Worker_TP2 pid=670) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:27:54 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=669) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.2 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:27:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=669) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:54 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP2 pid=670) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.6 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP2 pid=670) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:27:54 
[compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices -(Worker_TP3 pid=671) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:54 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices -(Worker_TP0 pid=668) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:27:54 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP3 pid=671) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 42.2 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:27:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP3 pid=671) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:54 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=668) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.6 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:54 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=668) DEBUG 04-23 00:27:54 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:55 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 204) from inductor_standalone via handle ('artifact_compile_range_1_204_subgraph_1', 
'/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_1_204_subgraph_1') -(Worker_TP1 pid=669) DEBUG 04-23 00:27:55 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices -(Worker_TP1 pid=669) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:27:55 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) -(Worker_TP1 pid=669) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:27:55 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=669) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:55 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices -(Worker_TP2 pid=670) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:55 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) -(Worker_TP2 pid=670) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:55 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=670) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:27:55 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices -(Worker_TP3 pid=671) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:27:55 
[compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) -(Worker_TP3 pid=671) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:27:55 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=671) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:55 [compilation/.../utility/noop_elimination.py:105] Removed 2 no-op reshapes and slices -(Worker_TP0 pid=668) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:55 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) -(Worker_TP0 pid=668) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:55 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=668) DEBUG 04-23 00:27:55 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:58 [compilation/backends.py:377] Store the 1-th graph for compile range(205, 8192) from inductor_standalone via handle ('artifact_compile_range_205_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_205_8192_subgraph_1') -(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 204) from inductor_standalone via handle ('artifact_compile_range_1_204_subgraph_2', '/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_1_204_subgraph_2') -(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/backends.py:377] Store the 2-th graph for compile 
range(205, 8192) from inductor_standalone via handle ('artifact_compile_range_205_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_205_8192_subgraph_2') -(Worker_TP1 pid=669) DEBUG 04-23 00:27:59 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=669) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:59 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=670) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:27:59 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=669) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.8 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:27:59 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=669) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:59 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP2 pid=670) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.3 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:27:59 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP2 pid=670) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] 
FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:27:59 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=671) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.7 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=668) DEBUG 04-23 00:27:59 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 40.7 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:28:00 [compilation/backends.py:377] Store the 3-th graph for compile range(1, 204) from inductor_standalone 
via handle ('artifact_compile_range_1_204_subgraph_3', '/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_1_204_subgraph_3') -(Worker_TP1 pid=669) DEBUG 04-23 00:28:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=669) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:28:00 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) -(Worker_TP1 pid=669) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:28:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=669) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:28:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=670) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:28:00 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) -(Worker_TP2 pid=670) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:28:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=670) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:28:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=668) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(Worker_TP0 
pid=668) DEBUG 04-23 00:28:00 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) -(Worker_TP0 pid=668) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:28:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=668) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.6 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) -(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=671) DEBUG 04-23 00:28:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:28:00 [compilation/backends.py:377] Store the 3-th graph for compile range(205, 8192) from inductor_standalone via handle ('artifact_compile_range_205_8192_subgraph_3', '/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_205_8192_subgraph_3') -(APIServer pid=1) DEBUG 04-23 00:28:01 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP2 pid=670) DEBUG 04-23 00:28:03 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=670) DEBUG 04-23 00:28:03 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 37.1 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.3 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 
[compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 37.8 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/backends.py:377] Store the 48-th graph for compile range(1, 204) from inductor_standalone via handle ('artifact_compile_range_1_204_subgraph_48', '/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_1_204_subgraph_48') -(Worker_TP0 pid=668) INFO 04-23 00:28:04 [compilation/backends.py:390] Compiling a graph for compile range (1, 204) takes 11.13 s -(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) -(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 
[compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=670) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.4 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) -(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=669) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 
[compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) -(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=668) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/passes/pass_manager.py:100] Skipping with compile range (205, 8192) -(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=671) DEBUG 04-23 00:28:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP0 pid=668) DEBUG 04-23 00:28:05 [compilation/backends.py:377] Store the 48-th graph for compile range(205, 8192) from inductor_standalone via handle ('artifact_compile_range_205_8192_subgraph_48', '/data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/artifact_compile_range_205_8192_subgraph_48') -(Worker_TP0 pid=668) INFO 04-23 00:28:05 [compilation/backends.py:390] Compiling a graph for compile range (205, 8192) takes 12.01 s -(Worker_TP0 pid=668) DEBUG 04-23 00:28:05 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/a31c82151b/rank_0_0/backbone/computation_graph.py -(Worker_TP0 pid=668) INFO 04-23 00:28:07 [compilation/decorators.py:640] saved AOT 
compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/3dd8de28feb215835436b71e8332dffd08ece5a27a87859d202824d9bdf14468/rank_0_0/model -(Worker_TP0 pid=668) INFO 04-23 00:28:07 [compilation/monitor.py:48] torch.compile took 29.69 s in total -(Worker_TP0 pid=668) WARNING 04-23 00:28:08 [model_executor/.../fused_moe/fused_moe.py:1090] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json -(Worker_TP0 pid=668) INFO 04-23 00:28:09 [compilation/monitor.py:76] Initial profiling/warmup run took 2.14 s -(APIServer pid=1) DEBUG 04-23 00:28:11 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=668) INFO 04-23 00:28:15 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP0 pid=668) WARNING 04-23 00:28:15 [v1/worker/gpu_model_runner.py:6339] CUDAGraphMode.FULL_AND_PIECEWISE is not supported with ChunkedLocalAttention_8192_FlashAttentionBackend backend (support: AttentionCGSupport.NEVER); setting cudagraph_mode=PIECEWISE because attention is compiled piecewise -(Worker_TP0 pid=668) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP0 pid=668) INFO 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512) -(Worker_TP3 pid=671) INFO 04-23 00:28:15 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP3 pid=671) WARNING 04-23 00:28:15 [v1/worker/gpu_model_runner.py:6339] CUDAGraphMode.FULL_AND_PIECEWISE is not supported with ChunkedLocalAttention_8192_FlashAttentionBackend backend (support: AttentionCGSupport.NEVER); setting cudagraph_mode=PIECEWISE because attention is compiled piecewise -(Worker_TP1 pid=669) INFO 04-23 
00:28:15 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP1 pid=669) WARNING 04-23 00:28:15 [v1/worker/gpu_model_runner.py:6339] CUDAGraphMode.FULL_AND_PIECEWISE is not supported with ChunkedLocalAttention_8192_FlashAttentionBackend backend (support: AttentionCGSupport.NEVER); setting cudagraph_mode=PIECEWISE because attention is compiled piecewise -(Worker_TP3 pid=671) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP3 pid=671) INFO 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512) -(Worker_TP1 pid=669) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP1 pid=669) INFO 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512) -(Worker_TP2 pid=670) INFO 04-23 00:28:15 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP2 pid=670) WARNING 04-23 00:28:15 [v1/worker/gpu_model_runner.py:6339] CUDAGraphMode.FULL_AND_PIECEWISE is not supported with ChunkedLocalAttention_8192_FlashAttentionBackend backend (support: AttentionCGSupport.NEVER); setting cudagraph_mode=PIECEWISE because attention is compiled piecewise -(Worker_TP2 pid=670) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP2 pid=670) INFO 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512) -(Worker_TP0 pid=668) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=671) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=670) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=668) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=668) DEBUG 04-23 00:28:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=668) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-23 00:28:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-23 00:28:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=668) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=668) DEBUG 04-23 00:28:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=671) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=671) DEBUG 04-23 00:28:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=669) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=668) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 20.00 MiB per-graph -(Worker_TP0 pid=668) INFO 04-23 
00:28:16 [distributed/device_communicators/custom_all_reduce.py:216] Registering 192 cuda graph addresses -(Worker_TP3 pid=671) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-23 00:28:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=671) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=671) DEBUG 04-23 00:28:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=670) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=670) DEBUG 04-23 00:28:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=669) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 20.00 MiB per-graph -(Worker_TP1 pid=669) INFO 04-23 00:28:16 [distributed/device_communicators/custom_all_reduce.py:216] Registering 192 cuda graph addresses -(Worker_TP3 pid=671) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 20.00 MiB per-graph -(Worker_TP3 pid=671) INFO 04-23 00:28:16 [distributed/device_communicators/custom_all_reduce.py:216] Registering 192 cuda graph addresses -(Worker_TP2 pid=670) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: 
None -(Worker_TP2 pid=670) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=670) DEBUG 04-23 00:28:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=670) DEBUG 04-23 00:28:16 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 20.00 MiB per-graph -(Worker_TP2 pid=670) INFO 04-23 00:28:16 [distributed/device_communicators/custom_all_reduce.py:216] Registering 192 cuda graph addresses -(Worker_TP0 pid=668) DEBUG 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP0 pid=668) INFO 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.09 GiB total -(Worker_TP3 pid=671) DEBUG 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP3 pid=671) INFO 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.09 GiB total -(Worker_TP1 pid=669) DEBUG 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP1 pid=669) INFO 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.09 GiB total -(Worker_TP2 pid=670) DEBUG 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP2 pid=670) INFO 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.09 GiB total -(Worker_TP3 pid=671) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:424] Initial free memory: 77.12 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP3 pid=671) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:430] Free memory after profiling: 21.22 GiB (total), 19.33 GiB (within requested) -(Worker_TP3 pid=671) DEBUG 04-23 00:28:17 
[v1/worker/gpu_worker.py:435] Memory profiling takes 40.78 seconds. Total non KV cache memory: 58.48GiB; torch peak memory increase: 3.19GiB; non-torch forward increase memory: 2.17GiB; weights memory: 53.12GiB. -(Worker_TP3 pid=671) INFO 04-23 00:28:17 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9637 to maintain the same effective KV cache size. -(Worker_TP3 pid=671) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=668) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:424] Initial free memory: 77.12 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP0 pid=668) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:430] Free memory after profiling: 21.22 GiB (total), 19.33 GiB (within requested) -(Worker_TP0 pid=668) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:435] Memory profiling takes 40.70 seconds. Total non KV cache memory: 58.48GiB; torch peak memory increase: 3.19GiB; non-torch forward increase memory: 2.17GiB; weights memory: 53.12GiB. -(Worker_TP0 pid=668) INFO 04-23 00:28:17 [v1/worker/gpu_worker.py:436] Available KV cache memory: 16.75 GiB -(Worker_TP0 pid=668) INFO 04-23 00:28:17 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9637 to maintain the same effective KV cache size. 
-(Worker_TP0 pid=668) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=469) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=469) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=669) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:424] Initial free memory: 77.12 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP1 pid=669) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:430] Free memory after profiling: 21.22 GiB (total), 19.33 GiB (within requested) -(Worker_TP1 pid=669) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:435] Memory profiling takes 40.73 seconds. Total non KV cache memory: 58.48GiB; torch peak memory increase: 3.19GiB; non-torch forward increase memory: 2.17GiB; weights memory: 53.12GiB. -(Worker_TP1 pid=669) INFO 04-23 00:28:17 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9637 to maintain the same effective KV cache size. 
-(Worker_TP1 pid=669) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=469) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=469) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=670) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:424] Initial free memory: 77.12 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP2 pid=670) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:430] Free memory after profiling: 21.22 GiB (total), 19.33 GiB (within requested) -(Worker_TP2 pid=670) DEBUG 04-23 00:28:17 [v1/worker/gpu_worker.py:435] Memory profiling takes 40.85 seconds. Total non KV cache memory: 58.48GiB; torch peak memory increase: 3.19GiB; non-torch forward increase memory: 2.17GiB; weights memory: 53.12GiB. -(Worker_TP2 pid=670) INFO 04-23 00:28:17 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9637 to maintain the same effective KV cache size. 
-(Worker_TP2 pid=670) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=469) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=469) INFO 04-23 00:28:17 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 365,968 tokens -(EngineCore pid=469) INFO 04-23 00:28:17 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 44.68x -(Worker_TP0 pid=668) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=669) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=671) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=670) DEBUG 04-23 00:28:17 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=670) 2026-04-23 00:28:17,637 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP1 pid=669) 2026-04-23 00:28:17,637 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP3 pid=671) 2026-04-23 00:28:17,637 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP0 pid=668) 2026-04-23 00:28:17,637 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(Worker_TP2 pid=670) DEBUG 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=671) DEBUG 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=668) DEBUG 04-23 00:28:17 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=668) 2026-04-23 00:28:17,696 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP1 pid=669) 2026-04-23 00:28:17,696 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP3 pid=671) 2026-04-23 00:28:17,696 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP2 pid=670) 2026-04-23 00:28:17,696 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP0 pid=668) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00: ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=469) DEBUG 04-23 00:28:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=469) DEBUG 04-23 00:28:39 [v1/engine/core.py:190] Batch queue is enabled with size 2 -(EngineCore pid=469) DEBUG 04-23 00:28:40 [utils/gc_utils.py:40] GC Debug Config. enabled:False,top_objects:-1 -(EngineCore pid=469) INFO 04-23 00:28:40 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-23 00:28:40 [v1/engine/utils.py:1158] READY from local core engine process 0. 
-(APIServer pid=1) DEBUG 04-23 00:28:40 [v1/metrics/loggers.py:273] Engine 000: vllm cache_config_info with initialization after num_gpu_blocks is: 91495 -(EngineCore pid=469) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=469) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=469) INFO 04-23 00:28:40 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP0 pid=668) DEBUG 04-23 00:28:40 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=669) DEBUG 04-23 00:28:40 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=670) DEBUG 04-23 00:28:40 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=671) DEBUG 04-23 00:28:40 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=469) DEBUG 04-23 00:28:40 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=469) DEBUG 04-23 00:28:40 [v1/engine/core.py:1158] EngineCore waiting for work. -(EngineCore pid=469) DEBUG 04-23 00:28:40 [v1/engine/core.py:1158] EngineCore waiting for work. -(APIServer pid=1) INFO 04-23 00:28:40 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] -(APIServer pid=1) WARNING 04-23 00:28:40 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 0.6, 'top_p': 0.9}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. -(APIServer pid=1) DEBUG 04-23 00:28:40 [renderers/base.py:197] Warming up chat template processing... 
-(Worker_TP1 pid=669) DEBUG 04-23 00:28:41 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=670) DEBUG 04-23 00:28:41 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=668) DEBUG 04-23 00:28:41 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=671) DEBUG 04-23 00:28:41 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) INFO 04-23 00:28:42 [renderers/hf.py:314] Detected the chat template content format to be 'openai'. You can set `--chat-template-content-format` to override this. -(APIServer pid=1) DEBUG 04-23 00:28:42 [renderers/base.py:203] Chat template warmup completed in 2.133s -(APIServer pid=1) DEBUG 04-23 00:28:42 [renderers/base.py:218] Warming up multi-modal processing... -(APIServer pid=1) INFO 04-23 00:28:47 [renderers/base.py:231] Multi-modal warmup completed in 4.752s -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:37] Available routes are: -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /load, Methods: GET -(APIServer pid=1) INFO 04-23 00:28:47 
[entrypoints/launcher.py:46] Route: /version, Methods: GET -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /health, Methods: GET -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /ping, Methods: GET -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /ping, Methods: POST -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST 
-(APIServer pid=1) INFO 04-23 00:28:47 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST -(APIServer pid=1) INFO: Started server process [1] -(APIServer pid=1) INFO: Waiting for application startup. -(APIServer pid=1) INFO: Application startup complete. -(APIServer pid=1) DEBUG 04-23 00:28:49 [v1/engine/async_llm.py:875] Called check_health. -(APIServer pid=1) INFO: 10.128.10.2:42458 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/microsoft-phi-2--h100-80gb--tp1pp1dp1--2048.log b/accuracy/results/v0.19.0/logs/microsoft-phi-2--h100-80gb--tp1pp1dp1--2048.log deleted file mode 100644 index 32d7d147..00000000 --- a/accuracy/results/v0.19.0/logs/microsoft-phi-2--h100-80gb--tp1pp1dp1--2048.log +++ /dev/null @@ -1,769 +0,0 @@ -DEBUG 04-22 20:00:23 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 20:00:23 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 20:00:23 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 20:00:23 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 20:00:23 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 20:00:23 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 20:00:23 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 20:00:23 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 20:00:23 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 20:00:23 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 20:00:23 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 20:00:23 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-22 20:00:28 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 20:00:30 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 20:00:30 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 20:00:30 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 20:00:30 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 20:00:30 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 20:00:30 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 20:00:30 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 20:00:30 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 20:00:30 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model microsoft/phi-2 -(APIServer pid=1) INFO 04-22 20:00:30 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 20:00:30 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 20:00:30 [entrypoints/utils.py:233] non-default args: {'model_tag': 'microsoft/phi-2', 'model': 'microsoft/phi-2', 'max_model_len': 2048, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 20:00:30 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 20:00:30 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.phi.PhiForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 20:00:30 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0016507 secs -(APIServer pid=1) INFO 04-22 20:00:30 
[config/model.py:549] Resolved architecture: PhiForCausalLM -(APIServer pid=1) INFO 04-22 20:00:30 [config/model.py:1678] Using max model len 2048 -(APIServer pid=1) DEBUG 04-22 20:00:30 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 20:00:30 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 20:00:30 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 20:00:30 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 20:00:30 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 20:00:30 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 20:00:30 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 20:00:30 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 20:00:30 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 20:00:31 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 20:00:31 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 20:00:35 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 20:00:35 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 20:00:35 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 20:00:35 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 20:00:35 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 20:00:35 [platforms/__init__.py:113] Checking if ROCm platform is available. 
-DEBUG 04-22 20:00:35 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 20:00:35 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 20:00:35 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 20:00:35 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 20:00:35 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 20:00:35 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 20:00:40 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=245) DEBUG 04-22 20:00:41 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 20:00:41 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=245) DEBUG 04-22 20:00:41 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/eca9f98d-548f-4f72-b376-b1acbadfa08d'], outputs=['ipc:///tmp/c0a62889-c208-4c99-bf84-864ec7c7abaf'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=245) DEBUG 04-22 20:00:41 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=245) DEBUG 04-22 20:00:41 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=245) DEBUG 04-22 20:00:41 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=245) DEBUG 04-22 20:00:41 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=245) DEBUG 04-22 20:00:41 [plugins/__init__.py:49] All plugins in this group will be 
loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(EngineCore pid=245) INFO 04-22 20:00:41 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='microsoft/phi-2', speculative_config=None, tokenizer='microsoft/phi-2', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=microsoft/phi-2, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=245) WARNING 04-22 20:00:42 [platforms/interface.py:525] Using 'pin_memory=False' as WSL is detected. This may slow down the performance. -(EngineCore pid=245) DEBUG 04-22 20:00:42 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.130.3.163:54515 backend=nccl -(EngineCore pid=245) INFO 04-22 20:00:42 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.130.3.163:54515 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(EngineCore pid=245) DEBUG 04-22 20:00:42 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=245) INFO 04-22 20:00:42 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=245) DEBUG 04-22 20:00:42 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776888042.7716968, auto_measure=True -(EngineCore pid=245) DEBUG 04-22 20:00:42 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=245) DEBUG 04-22 20:00:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=245) DEBUG 04-22 20:00:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=245) DEBUG 04-22 20:00:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=245) DEBUG 04-22 20:00:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=245) 
DEBUG 04-22 20:00:42 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(EngineCore pid=245) DEBUG 04-22 20:00:42 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=245) DEBUG 04-22 20:00:42 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=245) DEBUG 04-22 20:00:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=245) INFO 04-22 20:00:42 [v1/worker/gpu_model_runner.py:4735] Starting to load model microsoft/phi-2... -(EngineCore pid=245) DEBUG 04-22 20:00:43 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=80, dtype=torch.float16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASHINFER: [head_size not supported]}. -(EngineCore pid=245) INFO 04-22 20:00:43 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=245) INFO 04-22 20:00:43 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=245) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=245) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=245) DEBUG 04-22 20:00:43 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=245) DEBUG 04-22 20:00:43 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=245) DEBUG 04-22 20:00:43 [config/compilation.py:1195] disabled custom ops: Counter({'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'gelu_new': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=245) DEBUG 04-22 20:00:43 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=245) DEBUG 04-22 20:00:43 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors']] -(APIServer pid=1) DEBUG 04-22 20:00:51 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 20:01:01 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 20:01:11 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 20:01:21 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-22 20:01:31 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=245) INFO 04-22 20:01:37 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for microsoft/phi-2: 53.079146 seconds -(EngineCore pid=245) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/normalization.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/phi.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=245) INFO 04-22 20:01:46 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/99e20e2350/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=610adf44e9 comp=e546579c48 code=72f26a8252bd2dc07110f02adcd5ec7b0a60ebd294b581ec9fa5989f3d1bc98e dir=/data/.cache/vllm/torch_compile_cache/99e20e2350/rank_0_0/backbone -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore 
pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 
'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 
'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 
'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 
'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 
'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_PLUGINS': 
None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=245) DEBUG 04-22 
20:01:46 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': 
True, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/backends.py:1074] Vllm config hash: 610adf44e9 -(EngineCore pid=245) INFO 04-22 20:01:46 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.52 s -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=245) DEBUG 04-22 20:01:46 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=245) DEBUG 04-22 20:01:47 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=245) DEBUG 04-22 20:01:47 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=245) DEBUG 04-22 20:01:47 
[compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=245) DEBUG 04-22 20:01:47 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=245) DEBUG 04-22 20:01:47 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=245) INFO 04-22 20:01:48 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=245) DEBUG 04-22 20:01:48 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/99e20e2350/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=245) DEBUG 04-22 20:01:49 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=245) DEBUG 04-22 20:01:49 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.8 ms -(EngineCore pid=245) DEBUG 04-22 20:01:49 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 7.0 ms -(EngineCore pid=245) DEBUG 04-22 20:01:49 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=245) DEBUG 04-22 20:01:49 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 1.2 ms -(EngineCore pid=245) DEBUG 04-22 20:01:50 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/99e20e2350/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=245) DEBUG 04-22 20:01:51 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=245) DEBUG 04-22 20:01:51 [compilation/passes/vllm_inductor_pass.py:79] 
NoOpEliminationPass completed in 0.8 ms -(EngineCore pid=245) DEBUG 04-22 20:01:51 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=245) DEBUG 04-22 20:01:51 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=245) DEBUG 04-22 20:01:51 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=245) DEBUG 04-22 20:01:51 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/99e20e2350/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') -(EngineCore pid=245) INFO 04-22 20:01:51 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.04 s -(EngineCore pid=245) DEBUG 04-22 20:01:51 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/99e20e2350/rank_0_0/backbone/computation_graph.py -(APIServer pid=1) DEBUG 04-22 20:01:51 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=245) INFO 04-22 20:01:52 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/072de5f68642eb70f697736336cf9589eebd286ea377b4e297ad05380d4998c3/rank_0_0/model -(EngineCore pid=245) INFO 04-22 20:01:52 [compilation/monitor.py:48] torch.compile took 10.19 s in total -(EngineCore pid=245) INFO 04-22 20:01:53 [compilation/monitor.py:76] Initial profiling/warmup run took 0.42 s -(EngineCore pid=245) INFO 04-22 20:01:58 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=245) INFO 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 20:01:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 20:01:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 100.00 MiB first-capture + (51-1) × 4.00 MiB per-graph 
-(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 20:01:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 20:01:58 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=245) DEBUG 04-22 20:01:58 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 166.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=245) DEBUG 04-22 20:01:59 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=245) INFO 04-22 20:01:59 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.55 GiB total -(EngineCore pid=245) DEBUG 04-22 20:01:59 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=245) DEBUG 04-22 20:01:59 [v1/worker/gpu_worker.py:430] Free memory after profiling: 73.04 GiB (total), 69.59 GiB (within requested) -(EngineCore pid=245) DEBUG 04-22 20:01:59 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.32 seconds. Total non KV cache memory: 6.23GiB; torch peak memory increase: 0.79GiB; non-torch forward increase memory: 0.24GiB; weights memory: 5.19GiB. 
-(EngineCore pid=245) INFO 04-22 20:01:59 [v1/worker/gpu_worker.py:436] Available KV cache memory: 69.0 GiB -(EngineCore pid=245) INFO 04-22 20:01:59 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9570 to maintain the same effective KV cache size. -(EngineCore pid=245) INFO 04-22 20:01:59 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 226,112 tokens -(EngineCore pid=245) INFO 04-22 20:01:59 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 2,048 tokens per request: 110.41x -(EngineCore pid=245) 2026-04-22 20:01:59,942 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=245) DEBUG 04-22 20:01:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) 2026-04-22 20:01:59,954 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=245) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 19:55:02 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 19:55:02 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 19:55:02 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 19:55:02 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 19:55:02 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 19:55:02 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model microsoft/phi-2 -(APIServer pid=1) INFO 04-22 19:55:02 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 19:55:02 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 19:55:02 [entrypoints/utils.py:233] non-default args: {'model_tag': 'microsoft/phi-2', 'model': 'microsoft/phi-2', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 19:55:02 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 19:55:03 [model_executor/models/registry.py:774] Cached model info file for class vllm.model_executor.models.phi.PhiForCausalLM not found -(APIServer pid=1) DEBUG 04-22 19:55:03 [model_executor/models/registry.py:834] Cache model info for class vllm.model_executor.models.phi.PhiForCausalLM miss. Loading model instead. 
-(APIServer pid=1) DEBUG 04-22 19:55:14 [model_executor/models/registry.py:844] Loaded model info for class vllm.model_executor.models.phi.PhiForCausalLM -(APIServer pid=1) DEBUG 04-22 19:55:14 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 10.8903088 secs -(APIServer pid=1) INFO 04-22 19:55:14 [config/model.py:549] Resolved architecture: PhiForCausalLM -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in -(APIServer pid=1) sys.exit(main()) -(APIServer pid=1) ^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main -(APIServer pid=1) args.dispatch_function(args) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd -(APIServer pid=1) uvloop.run(run_server(args)) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run -(APIServer pid=1) return __asyncio.run( -(APIServer pid=1) ^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run -(APIServer pid=1) return runner.run(main) -(APIServer pid=1) ^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run -(APIServer pid=1) return self._loop.run_until_complete(task) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper -(APIServer pid=1) return await main -(APIServer pid=1) ^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server -(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) -(APIServer pid=1) File 
"/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker -(APIServer pid=1) async with build_async_engine_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client -(APIServer pid=1) async with build_async_engine_client_from_engine_args( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 124, in build_async_engine_client_from_engine_args -(APIServer pid=1) vllm_config = engine_args.create_engine_config(usage_context=usage_context) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1549, in create_engine_config -(APIServer pid=1) model_config = self.create_model_config() -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/engine/arg_utils.py", line 1398, in create_model_config -(APIServer pid=1) return ModelConfig( -(APIServer pid=1) ^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/pydantic/_internal/_dataclasses.py", line 121, in __init__ -(APIServer pid=1) s.__pydantic_validator__.validate_python(ArgsKwargs(args, kwargs), self_instance=s) -(APIServer pid=1) pydantic_core._pydantic_core.ValidationError: 1 validation error for ModelConfig -(APIServer pid=1) Value error, User-specified max_model_len 
(8192) is greater than the derived max_model_len (max_position_embeddings=2048.0 or model_max_length=None in model's config.json). To allow overriding this maximum, set the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN=1. VLLM_ALLOW_LONG_MAX_MODEL_LEN must be used with extreme caution. If the model uses relative position encoding (RoPE), positions exceeding derived_max_model_len lead to nan. If the model uses absolute position encoding, positions exceeding derived_max_model_len will cause a CUDA array out-of-bounds error. [type=value_error, input_value=ArgsKwargs((), {'model': ...nderer_num_workers': 1}), input_type=ArgsKwargs] -(APIServer pid=1) For further information visit https://errors.pydantic.dev/2.12/v/value_error diff --git a/accuracy/results/v0.19.0/logs/microsoft-phi-4--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/microsoft-phi-4--h100-80gb--tp1pp1dp1--8192.log deleted file mode 100644 index aea7d3e0..00000000 --- a/accuracy/results/v0.19.0/logs/microsoft-phi-4--h100-80gb--tp1pp1dp1--8192.log +++ /dev/null @@ -1,752 +0,0 @@ -DEBUG 04-22 00:52:23 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:52:23 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:52:23 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:52:23 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:52:23 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:52:23 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:52:23 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:52:23 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:52:23 [platforms/__init__.py:165] Checking if CPU platform is available. 
-DEBUG 04-22 00:52:23 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:52:23 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:52:23 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:52:27 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:52:29 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 00:52:29 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:52:29 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:52:29 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:52:29 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:52:29 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:52:29 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:52:29 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:52:29 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model microsoft/phi-4 -(APIServer pid=1) INFO 04-22 00:52:29 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:52:29 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:52:29 [entrypoints/utils.py:233] non-default args: {'model_tag': 'microsoft/phi-4', 'model': 'microsoft/phi-4', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:52:29 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:52:30 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.phi3.Phi3ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:52:30 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0007826 secs -(APIServer pid=1) INFO 04-22 00:52:30 [config/model.py:549] Resolved architecture: Phi3ForCausalLM -(APIServer pid=1) INFO 04-22 00:52:30 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:52:30 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:52:30 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:52:30 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:52:30 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:52:30 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) INFO 04-22 00:52:30 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:52:30 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:52:30 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:52:30 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:52:30 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:52:30 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:52:34 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:52:34 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:52:34 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:52:34 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:52:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:52:34 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:52:34 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:52:34 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:52:34 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:52:34 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:52:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:52:34 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:52:39 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-(APIServer pid=1) DEBUG 04-22 00:52:40 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. -(EngineCore pid=244) DEBUG 04-22 00:52:40 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:52:40 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=244) DEBUG 04-22 00:52:40 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/12badbc3-9925-4219-a699-30fb51bff4ce'], outputs=['ipc:///tmp/bc7aadab-45d1-4425-969a-6f2a0352f1ba'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=244) DEBUG 04-22 00:52:40 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=244) DEBUG 04-22 00:52:40 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=244) DEBUG 04-22 00:52:40 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=244) DEBUG 04-22 00:52:40 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=244) DEBUG 04-22 00:52:40 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=244) INFO 04-22 00:52:40 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='microsoft/phi-4', speculative_config=None, tokenizer='microsoft/phi-4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=microsoft/phi-4, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=244) WARNING 04-22 00:52:40 [platforms/interface.py:525] Using 'pin_memory=False' as WSL is detected. This may slow down the performance. -(EngineCore pid=244) DEBUG 04-22 00:52:41 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.82:45741 backend=nccl -(EngineCore pid=244) INFO 04-22 00:52:41 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.82:45741 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(EngineCore pid=244) DEBUG 04-22 00:52:41 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) INFO 04-22 00:52:41 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=244) DEBUG 04-22 00:52:41 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776819161.606759, auto_measure=True -(EngineCore pid=244) DEBUG 04-22 00:52:41 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=244) DEBUG 04-22 00:52:41 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:52:41 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:52:41 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:52:41 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=244) 
DEBUG 04-22 00:52:41 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(EngineCore pid=244) DEBUG 04-22 00:52:41 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=244) DEBUG 04-22 00:52:41 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=244) INFO 04-22 00:52:41 [v1/worker/gpu_model_runner.py:4735] Starting to load model microsoft/phi-4... -(EngineCore pid=244) DEBUG 04-22 00:52:42 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=244) INFO 04-22 00:52:42 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=244) INFO 04-22 00:52:42 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=244) DEBUG 04-22 00:52:42 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=244) DEBUG 04-22 00:52:42 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=244) DEBUG 04-22 00:52:42 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 81, 'silu_and_mul': 40, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=244) DEBUG 04-22 00:52:42 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=244) DEBUG 04-22 00:52:42 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00005-of-00006.safetensors', 'model-00004-of-00006.safetensors', 'model-00002-of-00006.safetensors', 'model-00006-of-00006.safetensors', 'model-00001-of-00006.safetensors', 'model-00003-of-00006.safetensors']] -(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/6 [00:00 -(APIServer pid=1) DEBUG 04-22 00:53:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=244) INFO 04-22 00:53:12 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/2556e36a7d/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=c01d08ff9f comp=e546579c48 code=d4902a9f99bbc2392c45e4b4dc801f4424a11306edb95a61851b1062db20b8c4 dir=/data/.cache/vllm/torch_compile_cache/2556e36a7d/rank_0_0/backbone -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-22 00:53:12 
[compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
-(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=244) DEBUG 04-22 
00:53:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=244) 
DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=244) DEBUG 04-22 00:53:12 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=244) DEBUG 04-22 00:53:12 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=244) DEBUG 04-22 
00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore 
pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore 
pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=244) DEBUG 04-22 00:53:12 [compilation/backends.py:1074] Vllm config hash: c01d08ff9f -(EngineCore pid=244) INFO 04-22 00:53:12 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.97 s -(EngineCore pid=244) DEBUG 04-22 00:53:13 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=244) DEBUG 04-22 00:53:13 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=244) DEBUG 04-22 00:53:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:53:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=244) DEBUG 04-22 00:53:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=244) DEBUG 04-22 00:53:13 [compilation/.../utility/fix_functionalization.py:203] 
De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:53:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=244) INFO 04-22 00:53:15 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=244) DEBUG 04-22 00:53:15 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/2556e36a7d/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=244) DEBUG 04-22 00:53:15 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:53:15 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=244) DEBUG 04-22 00:53:15 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=244) DEBUG 04-22 00:53:15 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:53:15 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=244) DEBUG 04-22 00:53:17 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/2556e36a7d/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=244) DEBUG 04-22 00:53:18 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:53:18 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(EngineCore pid=244) DEBUG 04-22 00:53:18 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=244) 
DEBUG 04-22 00:53:18 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:53:18 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=244) DEBUG 04-22 00:53:19 [compilation/backends.py:377] Store the 40-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_40', '/data/.cache/vllm/torch_compile_cache/2556e36a7d/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_40') -(EngineCore pid=244) INFO 04-22 00:53:19 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.27 s -(EngineCore pid=244) DEBUG 04-22 00:53:19 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/2556e36a7d/rank_0_0/backbone/computation_graph.py -(EngineCore pid=244) INFO 04-22 00:53:20 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/e77a412f263c8a32560bf00ea90369dc5c974de159e6eb0e5eb243025ca9ebe4/rank_0_0/model -(EngineCore pid=244) INFO 04-22 00:53:20 [compilation/monitor.py:48] torch.compile took 12.51 s in total -(APIServer pid=1) DEBUG 04-22 00:53:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=244) INFO 04-22 00:53:21 [compilation/monitor.py:76] Initial profiling/warmup run took 0.70 s -(EngineCore pid=244) INFO 04-22 00:53:26 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=244) DEBUG 04-22 00:53:26 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=244) INFO 04-22 00:53:26 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=244) DEBUG 04-22 00:53:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:53:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:53:26 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:53:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:53:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:53:26 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:53:26 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 156.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-22 00:53:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:53:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on 
(FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:53:27 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 328.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=244) INFO 04-22 00:53:27 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.81 GiB total -(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_worker.py:430] Free memory after profiling: 50.78 GiB (total), 47.33 GiB (within requested) -(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_worker.py:435] Memory profiling takes 19.87 seconds. Total non KV cache memory: 29.15GiB; torch peak memory increase: 1.52GiB; non-torch forward increase memory: 0.25GiB; weights memory: 27.39GiB. -(EngineCore pid=244) INFO 04-22 00:53:27 [v1/worker/gpu_worker.py:436] Available KV cache memory: 46.08 GiB -(EngineCore pid=244) INFO 04-22 00:53:27 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. 
To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9602 to maintain the same effective KV cache size. -(EngineCore pid=244) INFO 04-22 00:53:27 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 241,568 tokens -(EngineCore pid=244) INFO 04-22 00:53:27 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 29.49x -(EngineCore pid=244) 2026-04-22 00:53:27,859 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=244) DEBUG 04-22 00:53:27 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) 2026-04-22 00:53:27,872 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:53:55 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:53:55 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:53:55 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:53:55 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:53:55 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:53:55 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model mistralai/Mistral-Small-3.1-24B-Instruct-2503 -(APIServer pid=1) INFO 04-22 00:53:55 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:53:55 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:53:55 [entrypoints/utils.py:233] non-default args: {'model_tag': 'mistralai/Mistral-Small-3.1-24B-Instruct-2503', 'model': 'mistralai/Mistral-Small-3.1-24B-Instruct-2503', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:53:55 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) INFO 04-22 00:53:55 [transformers_utils/config.py:288] Inferred from consolidated*.safetensors files torch.bfloat16 dtype. 
-(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] Initialized config PretrainedConfig { -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "architectures": [ -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "PixtralForConditionalGeneration" -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] ], -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "dtype": "bfloat16", -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "image_token_index": 10, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "multimodal_projector_bias": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "projector_hidden_act": "gelu", -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "spatial_merge_size": 2, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "text_config": { -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "_name_or_path": "", -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "add_cross_attention": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "architectures": [ -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "MistralForCausalLM" -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] ], -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "bad_words_ids": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "begin_suppress_tokens": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "bos_token_id": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] 
"chunk_size_feed_forward": 0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "cross_attention_hidden_size": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "decoder_start_token_id": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "diversity_penalty": 0.0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "do_sample": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "dtype": "bfloat16", -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "early_stopping": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "encoder_no_repeat_ngram_size": 0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "eos_token_id": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "exponential_decay_length_penalty": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "finetuning_task": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "forced_bos_token_id": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "forced_eos_token_id": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "head_dim": 128, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "hidden_act": "silu", -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "hidden_size": 5120, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "id2label": { -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "0": "LABEL_0", -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "1": "LABEL_1" -(APIServer pid=1) DEBUG 
04-22 00:53:55 [transformers_utils/configs/mistral.py:91] }, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "intermediate_size": 32768, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "is_decoder": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "is_encoder_decoder": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "label2id": { -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "LABEL_0": 0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "LABEL_1": 1 -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] }, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "length_penalty": 1.0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "max_length": 20, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "max_position_embeddings": 131072, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "max_seq_len": 131072, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "min_length": 0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "model_type": "", -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "no_repeat_ngram_size": 0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_attention_heads": 32, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_beam_groups": 1, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_beams": 1, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_hidden_layers": 40, -(APIServer pid=1) DEBUG 04-22 00:53:55 
[transformers_utils/configs/mistral.py:91] "num_key_value_heads": 8, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_return_sequences": 1, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "output_attentions": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "output_hidden_states": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "output_scores": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "pad_token_id": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "prefix": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "problem_type": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "pruned_heads": {}, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "remove_invalid_values": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "repetition_penalty": 1.0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "return_dict": true, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "return_dict_in_generate": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "rms_norm_eps": 1e-05, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "rope_theta": 1000000000.0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "sep_token_id": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "suppress_tokens": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "task_specific_params": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] 
"temperature": 1.0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "tf_legacy_loss": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "tie_encoder_decoder": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "tie_word_embeddings": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "tokenizer_class": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "top_k": 50, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "top_p": 1.0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "torchscript": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "typical_p": 1.0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "use_bfloat16": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "vocab_size": 131072 -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] }, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "transformers_version": "4.57.6", -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "vision_config": { -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "_name_or_path": "", -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "adapter_bias": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "add_cross_attention": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "add_pre_mm_projector_layer_norm": true, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "architectures": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 
[transformers_utils/configs/mistral.py:91] "bad_words_ids": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "begin_suppress_tokens": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "bos_token_id": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "chunk_size_feed_forward": 0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "cross_attention_hidden_size": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "decoder_start_token_id": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "diversity_penalty": 0.0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "do_sample": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "dtype": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "early_stopping": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "encoder_no_repeat_ngram_size": 0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "eos_token_id": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "exponential_decay_length_penalty": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "finetuning_task": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "forced_bos_token_id": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "forced_eos_token_id": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "hidden_size": 1024, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "id2label": { -(APIServer pid=1) DEBUG 04-22 00:53:55 
[transformers_utils/configs/mistral.py:91] "0": "LABEL_0", -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "1": "LABEL_1" -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] }, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "image_break_token_id": 12, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "image_end_token_id": 13, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "image_size": 1540, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "image_token_id": 10, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "intermediate_size": 4096, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "is_decoder": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "is_encoder_decoder": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "label2id": { -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "LABEL_0": 0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "LABEL_1": 1 -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] }, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "length_penalty": 1.0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "max_image_size": 1540, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "max_length": 20, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "min_length": 0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "mm_projector_id": "patch_merge", -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] 
"model_type": "", -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "no_repeat_ngram_size": 0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_attention_heads": 16, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_beam_groups": 1, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_beams": 1, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_channels": 3, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_hidden_layers": 24, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "num_return_sequences": 1, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "output_attentions": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "output_hidden_states": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "output_scores": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "pad_token_id": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "patch_size": 14, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "prefix": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "problem_type": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "pruned_heads": {}, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "remove_invalid_values": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "repetition_penalty": 1.0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "return_dict": true, -(APIServer pid=1) DEBUG 04-22 00:53:55 
[transformers_utils/configs/mistral.py:91] "return_dict_in_generate": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "rope_theta": 10000.0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "sep_token_id": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "spatial_merge_size": 2, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "suppress_tokens": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "task_specific_params": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "temperature": 1.0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "tf_legacy_loss": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "tie_encoder_decoder": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "tie_word_embeddings": true, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "tokenizer_class": null, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "top_k": 50, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "top_p": 1.0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "torchscript": false, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "typical_p": 1.0, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "use_bfloat16": false -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] }, -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] "vision_feature_layer": -1 -(APIServer pid=1) DEBUG 04-22 00:53:55 [transformers_utils/configs/mistral.py:91] } -(APIServer pid=1) DEBUG 04-22 00:53:55 
[transformers_utils/configs/mistral.py:91] -(APIServer pid=1) DEBUG 04-22 00:53:56 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.pixtral.PixtralForConditionalGeneration from cache -(APIServer pid=1) DEBUG 04-22 00:53:56 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0011260 secs -(APIServer pid=1) INFO 04-22 00:53:56 [config/model.py:549] Resolved architecture: PixtralForConditionalGeneration -(APIServer pid=1) INFO 04-22 00:53:56 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:53:56 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:53:56 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:53:56 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:53:56 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:53:56 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 00:53:56 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:53:56 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:53:56 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:53:56 [tokenizers/registry.py:68] Loading MistralTokenizer for tokenizer_mode='mistral' -(APIServer pid=1) DEBUG 04-22 00:53:56 [renderers/registry.py:57] Loading MistralRenderer for renderer_mode='mistral' -(APIServer pid=1) DEBUG 04-22 00:53:57 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. 
-(APIServer pid=1) DEBUG 04-22 00:53:57 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:54:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:54:01 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:54:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:54:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:54:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:54:01 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:54:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:54:01 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:54:01 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:54:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:54:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:54:01 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:54:06 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=243) DEBUG 04-22 00:54:07 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:54:07 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=243) DEBUG 04-22 00:54:07 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/c09e0e00-f7cd-4413-841f-2dc61caf765d'], outputs=['ipc:///tmp/44f8ab81-724b-42a3-8f49-3523a3c4a6fb'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 00:54:07 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 00:54:07 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 00:54:07 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 00:54:07 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 00:54:07 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=243) INFO 04-22 00:54:07 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='mistralai/Mistral-Small-3.1-24B-Instruct-2503', speculative_config=None, tokenizer='mistralai/Mistral-Small-3.1-24B-Instruct-2503', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=mistralai/Mistral-Small-3.1-24B-Instruct-2503, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) DEBUG 04-22 00:54:08 [tokenizers/registry.py:68] Loading MistralTokenizer for tokenizer_mode='mistral' -(EngineCore pid=243) DEBUG 04-22 00:54:09 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.33:45449 backend=nccl -(EngineCore pid=243) INFO 04-22 00:54:09 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.33:45449 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(EngineCore pid=243) DEBUG 04-22 00:54:09 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) INFO 04-22 00:54:09 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=243) DEBUG 04-22 00:54:09 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776819249.5632694, auto_measure=True -(EngineCore pid=243) DEBUG 04-22 00:54:09 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=243) DEBUG 04-22 00:54:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:54:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:54:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:54:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=243) 
DEBUG 04-22 00:54:09 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(EngineCore pid=243) DEBUG 04-22 00:54:09 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=243) DEBUG 04-22 00:54:09 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. -(EngineCore pid=243) DEBUG 04-22 00:54:09 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=243) INFO 04-22 00:54:09 [v1/worker/gpu_model_runner.py:4735] Starting to load model mistralai/Mistral-Small-3.1-24B-Instruct-2503... -(EngineCore pid=243) INFO 04-22 00:54:10 [config/vllm.py:790] Asynchronous scheduling is enabled. -(EngineCore pid=243) DEBUG 04-22 00:54:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds', 't_cond'] -(EngineCore pid=243) DEBUG 04-22 00:54:10 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=243) INFO 04-22 00:54:10 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=243) INFO 04-22 00:54:10 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
-(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=243) DEBUG 04-22 00:54:10 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=243) DEBUG 04-22 00:54:10 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=243) DEBUG 04-22 00:54:10 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=243) DEBUG 04-22 00:54:10 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 81, 'silu_and_mul': 40, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=243) DEBUG 04-22 00:54:10 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=243) DEBUG 04-22 00:54:10 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 131, 'silu_and_mul': 40, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1, 'conv2d': 1}) -(EngineCore pid=243) DEBUG 04-22 00:54:10 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=243) DEBUG 04-22 00:54:10 [model_executor/model_loader/weight_utils.py:557] Using model weights format ['consolidated*.safetensors', '*.pt'] -(EngineCore pid=243) INFO 04-22 00:54:11 [model_executor/model_loader/weight_utils.py:625] No consolidated.safetensors.index.json found in remote. -(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00 -(APIServer pid=1) DEBUG 04-22 00:54:57 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=243) DEBUG 04-22 
00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/mistral.py -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=243) INFO 04-22 00:55:02 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/d521e0d6c8/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=d4f29d6b5e comp=e546579c48 code=3efb0f576ccfea9fc4cca5c687c92f9011bee8476a24c13d13c993a50f3eaf70 dir=/data/.cache/vllm/torch_compile_cache/d521e0d6c8/rank_0_0/backbone -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=243) 
DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 
[compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 
[compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 
[compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', 
-(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 
1, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=243) DEBUG 04-22 00:55:02 
[compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 
[compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, 
-(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/backends.py:1074] Vllm config hash: d4f29d6b5e -(EngineCore pid=243) INFO 04-22 00:55:02 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.85 s -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=243) DEBUG 04-22 00:55:02 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=243) DEBUG 04-22 00:55:03 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:55:03 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 00:55:03 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms -(EngineCore pid=243) DEBUG 04-22 00:55:03 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:55:03 [compilation/passes/vllm_inductor_pass.py:79] 
FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) INFO 04-22 00:55:05 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=243) DEBUG 04-22 00:55:05 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/d521e0d6c8/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=243) DEBUG 04-22 00:55:05 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:55:05 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=243) DEBUG 04-22 00:55:05 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=243) DEBUG 04-22 00:55:05 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:55:05 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 00:55:06 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/d521e0d6c8/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=243) DEBUG 04-22 00:55:07 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/d521e0d6c8/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_2') -(APIServer pid=1) DEBUG 04-22 00:55:07 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) DEBUG 04-22 00:55:08 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:55:08 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(EngineCore pid=243) DEBUG 04-22 00:55:08 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=243) DEBUG 04-22 00:55:08 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:55:08 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 00:55:09 [compilation/backends.py:377] Store the 40-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_40', '/data/.cache/vllm/torch_compile_cache/d521e0d6c8/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_40') -(EngineCore pid=243) INFO 04-22 00:55:09 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.55 s -(EngineCore pid=243) DEBUG 04-22 00:55:09 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/d521e0d6c8/rank_0_0/backbone/computation_graph.py -(EngineCore pid=243) INFO 04-22 00:55:10 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/98ced68e0a54c070de5d01fdcea3f926c3687257f6bcfb752981b49fdfe92890/rank_0_0/model -(EngineCore pid=243) INFO 04-22 00:55:10 [compilation/monitor.py:48] torch.compile took 12.68 s in total -(EngineCore pid=243) INFO 04-22 00:55:11 [compilation/monitor.py:76] Initial profiling/warmup run took 0.90 s -(EngineCore pid=243) INFO 04-22 00:55:16 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for 
CUDA graph profiling -(EngineCore pid=243) INFO 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:55:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:55:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 212.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:55:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None 
-(EngineCore pid=243) DEBUG 04-22 00:55:16 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:55:16 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 00:55:17 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=243) INFO 04-22 00:55:17 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total -(APIServer pid=1) DEBUG 04-22 00:55:17 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=243) DEBUG 04-22 00:55:17 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=243) DEBUG 04-22 00:55:17 [v1/worker/gpu_worker.py:430] Free memory after profiling: 33.42 GiB (total), 29.98 GiB (within requested) -(EngineCore pid=243) DEBUG 04-22 00:55:17 [v1/worker/gpu_worker.py:435] Memory profiling takes 21.11 seconds. Total non KV cache memory: 47.04GiB; torch peak memory increase: 2.03GiB; non-torch forward increase memory: 0.25GiB; weights memory: 44.76GiB. -(EngineCore pid=243) INFO 04-22 00:55:17 [v1/worker/gpu_worker.py:436] Available KV cache memory: 28.19 GiB -(EngineCore pid=243) INFO 04-22 00:55:17 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. 
-(EngineCore pid=243) INFO 04-22 00:55:17 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 184,752 tokens -(EngineCore pid=243) INFO 04-22 00:55:17 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 22.55x -(EngineCore pid=243) 2026-04-22 00:55:17,728 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=243) DEBUG 04-22 00:55:17 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) 2026-04-22 00:55:17,742 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:55:53 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:55:53 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:55:54 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:55:54 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:55:54 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:55:54 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model mistralai/Mixtral-8x7B-Instruct-v0.1 -(APIServer pid=1) INFO 04-22 00:55:54 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:55:54 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:55:54 [entrypoints/utils.py:233] non-default args: {'model_tag': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'model': 'mistralai/Mixtral-8x7B-Instruct-v0.1', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:55:54 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:55:54 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.mixtral.MixtralForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:55:54 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0015548 secs -(APIServer pid=1) INFO 04-22 00:55:54 [config/model.py:549] Resolved architecture: MixtralForCausalLM -(APIServer pid=1) INFO 04-22 00:55:54 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:55:54 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:55:54 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:55:54 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:55:54 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 00:55:54 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:55:54 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 00:55:54 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:55:54 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) DEBUG 04-22 00:55:55 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. -(APIServer pid=1) INFO 04-22 00:55:55 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 00:55:55 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:55:55 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:55:56 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:55:56 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:55:59 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:55:59 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:55:59 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:55:59 [platforms/__init__.py:62] Checking if CUDA platform is available. 
-DEBUG 04-22 00:55:59 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:55:59 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:55:59 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:55:59 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:55:59 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:55:59 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:55:59 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:55:59 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:56:04 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=244) DEBUG 04-22 00:56:05 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:56:05 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=244) DEBUG 04-22 00:56:05 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/899f8561-87fc-4033-8fbd-da1739ae100a'], outputs=['ipc:///tmp/bb7b91ab-7c47-4747-b3f9-be76f2bb676b'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=244) DEBUG 04-22 00:56:05 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=244) DEBUG 04-22 00:56:05 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=244) DEBUG 04-22 00:56:05 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=244) DEBUG 04-22 00:56:05 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=244) DEBUG 04-22 00:56:05 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=244) INFO 04-22 00:56:05 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='mistralai/Mixtral-8x7B-Instruct-v0.1', speculative_config=None, tokenizer='mistralai/Mixtral-8x7B-Instruct-v0.1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=mistralai/Mixtral-8x7B-Instruct-v0.1, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=244) WARNING 04-22 00:56:05 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=244) INFO 04-22 00:56:05 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.7.83 (local), world_size=2, local_world_size=2 -(EngineCore pid=244) DEBUG 04-22 00:56:05 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/750eb9b3-2997-4601-a967-1c5de9df277c -(EngineCore pid=244) DEBUG 04-22 00:56:05 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_18257a27'), local_subscribe_addr='ipc:///tmp/750eb9b3-2997-4601-a967-1c5de9df277c', local_notify_addr='ipc:///tmp/0eda70c6-3b93-4953-8320-cb4bfddf9d8c', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 00:56:09 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:56:09 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:56:09 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:56:09 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:56:09 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:56:09 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:56:09 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:56:09 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:56:09 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:56:09 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:56:09 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
-DEBUG 04-22 00:56:09 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:56:09 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:56:09 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:56:09 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:56:09 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:56:09 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:56:09 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:56:09 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:56:09 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:56:09 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:56:09 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:56:09 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:56:09 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:56:14 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:56:14 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-22 00:56:15 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:56:15 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:56:15 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:56:15 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 00:56:15 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:56:15 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:56:15 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:56:15 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) DEBUG 04-22 00:56:15 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker pid=443) DEBUG 04-22 00:56:15 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:38677 backend=nccl -(Worker pid=443) INFO 04-22 00:56:15 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:38677 backend=nccl -(Worker pid=444) DEBUG 04-22 00:56:16 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:38677 backend=nccl -(Worker pid=444) INFO 04-22 00:56:16 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:38677 backend=nccl -[Gloo] Rank 0 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=444) DEBUG 04-22 00:56:16 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=443) DEBUG 04-22 00:56:16 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=443) DEBUG 04-22 00:56:17 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=443) INFO 04-22 00:56:17 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=444) DEBUG 04-22 00:56:17 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=443) DEBUG 04-22 00:56:17 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=443) DEBUG 04-22 00:56:17 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/5ae62e70-3439-4050-81ce-b9208af6f8b5 -(Worker pid=443) DEBUG 04-22 00:56:17 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_5b23863a'), local_subscribe_addr='ipc:///tmp/5ae62e70-3439-4050-81ce-b9208af6f8b5', local_notify_addr='ipc:///tmp/99a778fb-5da6-4237-b33b-9bff59fd1ef3', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=444) DEBUG 04-22 00:56:17 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/5ae62e70-3439-4050-81ce-b9208af6f8b5 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 1 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 -(Worker pid=443) INFO 04-22 00:56:18 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A -(Worker pid=444) DEBUG 04-22 00:56:18 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.18GiB, total_memory=79.19GiB, cuda_memory=2.01GiB, torch_memory=0.02GiB, non_torch_memory=1.99GiB, timestamp=1776819378.4357853, auto_measure=True -(Worker pid=444) DEBUG 04-22 00:56:18 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=443) DEBUG 04-22 00:56:18 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.18GiB, total_memory=79.19GiB, cuda_memory=2.01GiB, torch_memory=0.02GiB, non_torch_memory=1.99GiB, timestamp=1776819378.438148, auto_measure=True -(Worker pid=443) DEBUG 04-22 00:56:18 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=443) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=444) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=443) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 00:56:18 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=444) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=443) DEBUG 04-22 00:56:18 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=443) DEBUG 04-22 00:56:18 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=444) DEBUG 04-22 00:56:18 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=443) DEBUG 04-22 00:56:18 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker pid=443) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker_TP0 pid=443) INFO 04-22 00:56:18 [v1/worker/gpu_model_runner.py:4735] Starting to load model mistralai/Mixtral-8x7B-Instruct-v0.1... 
-(Worker_TP1 pid=444) DEBUG 04-22 00:56:18 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker_TP0 pid=443) DEBUG 04-22 00:56:18 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_TP0 pid=443) INFO 04-22 00:56:18 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(Worker_TP0 pid=443) INFO 04-22 00:56:18 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP0 pid=443) INFO 04-22 00:56:18 [model_executor/.../oracle/unquantized.py:186] Using TRITON backend for Unquantized MoE -(Worker_TP0 pid=443) DEBUG 04-22 00:56:18 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts -(Worker_TP0 pid=443) DEBUG 04-22 00:56:19 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=443) DEBUG 04-22 00:56:19 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=443) DEBUG 04-22 00:56:19 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'fused_moe': 32, 'unquantized_fused_moe': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=443) DEBUG 04-22 00:56:19 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP1 pid=444) DEBUG 04-22 00:56:19 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP1 pid=444) DEBUG 04-22 00:56:19 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP1 pid=444) DEBUG 04-22 00:56:19 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'fused_moe': 32, 'unquantized_fused_moe': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=444) DEBUG 04-22 00:56:19 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP1 pid=444) DEBUG 04-22 00:56:19 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00018-of-00019.safetensors', 'model-00008-of-00019.safetensors', 'model-00009-of-00019.safetensors', 'model-00017-of-00019.safetensors', 'model-00007-of-00019.safetensors', 'model-00005-of-00019.safetensors', 'model-00002-of-00019.safetensors', 'model-00010-of-00019.safetensors', 'model-00003-of-00019.safetensors', 'model-00012-of-00019.safetensors', 'model-00016-of-00019.safetensors', 'model-00011-of-00019.safetensors', 'model-00014-of-00019.safetensors', 'model-00004-of-00019.safetensors', 'model-00006-of-00019.safetensors', 'model-00015-of-00019.safetensors', 'model-00001-of-00019.safetensors', 'model-00019-of-00019.safetensors', 'model-00013-of-00019.safetensors']] -(Worker_TP0 pid=443) DEBUG 04-22 00:56:19 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00010-of-00019.safetensors', 'model-00004-of-00019.safetensors', 'model-00001-of-00019.safetensors', 'model-00016-of-00019.safetensors', 'model-00008-of-00019.safetensors', 'model-00012-of-00019.safetensors', 'model-00019-of-00019.safetensors', 'model-00003-of-00019.safetensors', 'model-00006-of-00019.safetensors', 'model-00018-of-00019.safetensors', 'model-00015-of-00019.safetensors', 'model-00013-of-00019.safetensors', 
'model-00009-of-00019.safetensors', 'model-00011-of-00019.safetensors', 'model-00014-of-00019.safetensors', 'model-00007-of-00019.safetensors', 'model-00002-of-00019.safetensors', 'model-00017-of-00019.safetensors', 'model-00005-of-00019.safetensors']] -(Worker_TP0 pid=443) Loading safetensors checkpoint shards: 0% Completed | 0/19 [00:00 -(Worker_TP1 pid=444) DEBUG 04-22 00:57:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 00:57:25 [compilation/decorators.py:528] Start compiling function -(APIServer pid=1) DEBUG 04-22 00:57:25 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=244) DEBUG 04-22 00:57:26 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py -(Worker_TP0 pid=443) 
DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/mixtral.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py 
-(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/mixtral.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1063] torch.compile cache 
factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=cbf0405b61 comp=e546579c48 code=25313a6b8ebc50305714b2e0cfd04eb322a52ea698903c8ad5f901c77d4999c2 dir=/data/.cache/vllm/torch_compile_cache/6f8993cba3/rank_1_0/backbone -(Worker_TP0 pid=443) INFO 04-22 00:57:29 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/6f8993cba3/rank_0_0/backbone for vLLM's torch.compile -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=cbf0405b61 comp=e546579c48 code=25313a6b8ebc50305714b2e0cfd04eb322a52ea698903c8ad5f901c77d4999c2 dir=/data/.cache/vllm/torch_compile_cache/6f8993cba3/rank_0_0/backbone -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'VLLM_DISABLE_PYNCCL': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 
[compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP1 
pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP1 
pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP1 pid=444) DEBUG 04-22 
00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, 
-(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'VLLM_TRACE_FUNCTION': 0, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 
[compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, 
-(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] Vllm config hash: cbf0405b61 -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP0 pid=443) DEBUG 
04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP0 pid=443) DEBUG 04-22 
00:57:29 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 
'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP0 
pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP0 pid=443) 
DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': 
False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/backends.py:1074] Vllm 
config hash: cbf0405b61 -(Worker_TP0 pid=443) INFO 04-22 00:57:29 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.32 s -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 8192 -(Worker_TP0 pid=443) INFO 04-22 00:57:29 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm -(Worker_TP0 pid=443) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. -(Worker_TP0 pid=443) return func(*args, **kwargs) -(Worker_TP0 pid=443) DEBUG 04-22 00:57:29 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=8192, hidden_dim=4096, dtype=torch.bfloat16 -(Worker_TP1 pid=444) DEBUG 04-22 00:57:29 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=8192, hidden_dim=4096, dtype=torch.bfloat16 -(Worker_TP0 pid=443) INFO 04-22 00:57:29 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm -(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=444) DEBUG 04-22 00:57:30 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=444) DEBUG 04-22 00:57:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 23.5 ms -(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP0 pid=443) DEBUG 04-22 00:57:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=444) DEBUG 04-22 00:57:30 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP1 pid=444) DEBUG 04-22 00:57:30 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 23.7 ms -(Worker_TP1 pid=444) DEBUG 04-22 00:57:30 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=444) DEBUG 04-22 00:57:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP1 pid=444) DEBUG 04-22 00:57:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=443) INFO 04-22 00:57:33 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(Worker_TP0 pid=443) DEBUG 04-22 00:57:33 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/6f8993cba3/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(Worker_TP0 pid=443) DEBUG 04-22 00:57:33 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=443) DEBUG 04-22 00:57:33 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=443) DEBUG 04-22 00:57:33 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=443) DEBUG 04-22 00:57:33 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 36.3 ms -(Worker_TP0 pid=443) DEBUG 04-22 00:57:33 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP0 pid=443) DEBUG 04-22 00:57:33 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=443) DEBUG 04-22 00:57:33 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP1 pid=444) DEBUG 04-22 00:57:33 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=444) DEBUG 04-22 00:57:33 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=444) DEBUG 04-22 00:57:33 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=444) DEBUG 04-22 00:57:33 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 35.8 ms -(Worker_TP1 pid=444) DEBUG 04-22 00:57:33 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP1 pid=444) DEBUG 04-22 00:57:33 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=444) DEBUG 04-22 00:57:33 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=443) DEBUG 04-22 00:57:34 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', 
'/data/.cache/vllm/torch_compile_cache/6f8993cba3/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 36.8 ms -(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/6f8993cba3/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') -(Worker_TP0 pid=443) INFO 04-22 00:57:35 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.02 s -(Worker_TP0 pid=443) DEBUG 04-22 00:57:35 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/6f8993cba3/rank_0_0/backbone/computation_graph.py -(Worker_TP1 pid=444) DEBUG 04-22 00:57:35 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=444) DEBUG 04-22 00:57:35 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP1 pid=444) DEBUG 04-22 00:57:35 
[compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=444) DEBUG 04-22 00:57:35 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 36.9 ms -(Worker_TP1 pid=444) DEBUG 04-22 00:57:35 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP1 pid=444) DEBUG 04-22 00:57:35 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP1 pid=444) DEBUG 04-22 00:57:35 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(APIServer pid=1) DEBUG 04-22 00:57:35 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=443) INFO 04-22 00:57:36 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/de1a3d5a254a6970995e4bf5ae425faa50662dbbc60a8657443a824f90c07d5c/rank_0_0/model -(Worker_TP0 pid=443) INFO 04-22 00:57:36 [compilation/monitor.py:48] torch.compile took 11.13 s in total -(Worker_TP0 pid=443) INFO 04-22 00:57:36 [model_executor/.../fused_moe/fused_moe.py:1077] Using configuration from /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json for MoE layer. 
-(Worker_TP0 pid=443) INFO 04-22 00:57:37 [compilation/monitor.py:76] Initial profiling/warmup run took 1.40 s -(Worker_TP0 pid=443) INFO 04-22 00:57:43 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP0 pid=443) DEBUG 04-22 00:57:43 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP0 pid=443) INFO 04-22 00:57:43 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP1 pid=444) INFO 04-22 00:57:43 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP1 pid=444) DEBUG 04-22 00:57:43 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP1 pid=444) INFO 04-22 00:57:43 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP0 pid=443) DEBUG 04-22 00:57:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 00:57:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, 
ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 84.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 84.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on 
(FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=444) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 136.00 MiB first-capture + (51-1) × 8.00 MiB per-graph -(Worker_TP1 pid=444) INFO 04-22 00:57:44 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP0 pid=443) DEBUG 04-22 00:57:44 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 136.00 MiB first-capture + (51-1) × 8.00 MiB per-graph -(Worker_TP0 pid=443) INFO 04-22 00:57:44 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP0 pid=443) DEBUG 04-22 00:57:45 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP0 pid=443) INFO 04-22 00:57:45 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.82 GiB total -(Worker_TP1 pid=444) DEBUG 04-22 00:57:45 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and 
CUDA graphs -(Worker_TP1 pid=444) INFO 04-22 00:57:45 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.82 GiB total -(APIServer pid=1) DEBUG 04-22 00:57:45 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=443) DEBUG 04-22 00:57:45 [v1/worker/gpu_worker.py:424] Initial free memory: 77.18 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP0 pid=443) DEBUG 04-22 00:57:45 [v1/worker/gpu_worker.py:430] Free memory after profiling: 30.69 GiB (total), 28.73 GiB (within requested) -(Worker_TP0 pid=443) DEBUG 04-22 00:57:45 [v1/worker/gpu_worker.py:435] Memory profiling takes 20.58 seconds. Total non KV cache memory: 46.79GiB; torch peak memory increase: 1.21GiB; non-torch forward increase memory: 2.07GiB; weights memory: 43.51GiB. -(Worker_TP0 pid=443) INFO 04-22 00:57:45 [v1/worker/gpu_worker.py:436] Available KV cache memory: 28.44 GiB -(Worker_TP0 pid=443) INFO 04-22 00:57:45 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9603 to maintain the same effective KV cache size. 
-(Worker_TP0 pid=443) DEBUG 04-22 00:57:45 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 00:57:45 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 00:57:45 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=444) DEBUG 04-22 00:57:45 [v1/worker/gpu_worker.py:424] Initial free memory: 77.18 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP1 pid=444) DEBUG 04-22 00:57:45 [v1/worker/gpu_worker.py:430] Free memory after profiling: 30.69 GiB (total), 28.73 GiB (within requested) -(Worker_TP1 pid=444) DEBUG 04-22 00:57:45 [v1/worker/gpu_worker.py:435] Memory profiling takes 20.55 seconds. Total non KV cache memory: 46.79GiB; torch peak memory increase: 1.21GiB; non-torch forward increase memory: 2.07GiB; weights memory: 43.51GiB. -(Worker_TP1 pid=444) INFO 04-22 00:57:45 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9603 to maintain the same effective KV cache size. 
-(Worker_TP1 pid=444) DEBUG 04-22 00:57:45 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 00:57:45 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) INFO 04-22 00:57:45 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 465,936 tokens -(EngineCore pid=244) INFO 04-22 00:57:45 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 56.88x -(Worker_TP1 pid=444) DEBUG 04-22 00:57:45 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=443) DEBUG 04-22 00:57:45 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=443) 2026-04-22 00:57:45,967 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP1 pid=444) 2026-04-22 00:57:45,967 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP1 pid=444) DEBUG 04-22 00:57:45 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 00:57:45 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) 2026-04-22 00:57:45,993 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP0 pid=443) 2026-04-22 00:57:45,994 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP0 pid=443) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=244) DEBUG 04-22 00:57:57 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. -(EngineCore pid=244) INFO 04-22 00:57:57 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP0 pid=443) DEBUG 04-22 00:57:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=444) DEBUG 04-22 00:57:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 00:57:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 00:57:57 [v1/engine/core.py:1158] EngineCore waiting for work. -(EngineCore pid=244) DEBUG 04-22 00:57:57 [v1/engine/core.py:1158] EngineCore waiting for work. -(APIServer pid=1) INFO 04-22 00:57:57 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] -(APIServer pid=1) DEBUG 04-22 00:57:58 [renderers/base.py:197] Warming up chat template processing... -(APIServer pid=1) DEBUG 04-22 00:57:58 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e81d16-5a939a2440aa6b226ae3b22d;c34969b7-02e8-49b8-bf74-f77fa7c257a1) -(APIServer pid=1) DEBUG 04-22 00:57:58 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 00:57:58 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1/resolve/main/processor_config.json. -(APIServer pid=1) DEBUG 04-22 00:57:58 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. 
(Request ID: Root=1-69e81d16-16d7d3f56520faa20bc74780;0f1891c1-6bf7-452a-a1b3-3d65bb07b6c6) -(APIServer pid=1) DEBUG 04-22 00:57:58 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 00:57:58 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1/resolve/main/preprocessor_config.json. -(Worker_TP0 pid=443) DEBUG 04-22 00:57:58 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=444) DEBUG 04-22 00:57:58 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) INFO 04-22 00:57:58 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. -(APIServer pid=1) DEBUG 04-22 00:57:58 [renderers/base.py:203] Chat template warmup completed in 0.592s -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:37] Available routes are: -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /load, Methods: GET -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /version, Methods: GET -(APIServer pid=1) INFO 04-22 00:57:58 
[entrypoints/launcher.py:46] Route: /health, Methods: GET -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /ping, Methods: GET -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /ping, Methods: POST -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST -(APIServer pid=1) INFO 04-22 00:57:58 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST -(APIServer pid=1) INFO 04-22 00:57:59 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST -(APIServer pid=1) INFO 04-22 00:57:59 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 00:57:59 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 00:57:59 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST -(APIServer pid=1) INFO 04-22 00:57:59 [entrypoints/launcher.py:46] Route: /v1/completions/render, 
Methods: POST -(APIServer pid=1) INFO: Started server process [1] -(APIServer pid=1) INFO: Waiting for application startup. -(APIServer pid=1) INFO: Application startup complete. -(APIServer pid=1) DEBUG 04-22 00:58:03 [v1/engine/async_llm.py:875] Called check_health. -(APIServer pid=1) INFO: 10.131.6.2:57516 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp2pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp2pp1dp1--8192.log deleted file mode 100644 index 6d2e1de1..00000000 --- a/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp2pp1dp1--8192.log +++ /dev/null @@ -1,1565 +0,0 @@ -DEBUG 04-21 23:59:08 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-21 23:59:08 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-21 23:59:08 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-21 23:59:08 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:59:08 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:59:08 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-21 23:59:08 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-21 23:59:08 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-21 23:59:08 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-21 23:59:08 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:59:08 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:59:08 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-21 23:59:13 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-21 23:59:14 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-21 23:59:14 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-21 23:59:14 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-21 23:59:14 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-21 23:59:14 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-21 23:59:14 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-21 23:59:14 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-21 23:59:14 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-21 23:59:14 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model moonshotai/Kimi-Dev-72B -(APIServer pid=1) INFO 04-21 23:59:14 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-21 23:59:14 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-21 23:59:14 [entrypoints/utils.py:233] non-default args: {'model_tag': 'moonshotai/Kimi-Dev-72B', 'model': 'moonshotai/Kimi-Dev-72B', 'trust_remote_code': True, 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-21 23:59:14 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-21 23:59:15 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-21 23:59:15 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0011903 secs -(APIServer pid=1) INFO 04-21 23:59:15 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM -(APIServer pid=1) INFO 04-21 23:59:15 
[config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-21 23:59:15 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-21 23:59:15 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-21 23:59:15 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-21 23:59:15 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-21 23:59:15 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-21 23:59:15 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-21 23:59:15 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-21 23:59:15 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) INFO 04-21 23:59:16 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-21 23:59:16 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. 
-(APIServer pid=1) DEBUG 04-21 23:59:16 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-21 23:59:16 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-21 23:59:16 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-21 23:59:20 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-21 23:59:20 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-21 23:59:20 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-21 23:59:20 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:59:20 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:59:20 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-21 23:59:20 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-21 23:59:20 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-21 23:59:20 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-21 23:59:20 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:59:20 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:59:20 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-21 23:59:25 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=243) DEBUG 04-21 23:59:26 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-21 23:59:26 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=243) DEBUG 04-21 23:59:26 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/a6342252-7400-4e34-84be-903fde2e7e07'], outputs=['ipc:///tmp/4a1fa813-4215-4d18-96d1-bf099cceb399'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-21 23:59:26 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-21 23:59:26 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-21 23:59:26 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-21 23:59:26 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-21 23:59:26 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=243) INFO 04-21 23:59:26 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='moonshotai/Kimi-Dev-72B', speculative_config=None, tokenizer='moonshotai/Kimi-Dev-72B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=moonshotai/Kimi-Dev-72B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [4096, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) WARNING 04-21 23:59:26 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=243) INFO 04-21 23:59:26 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.128.4.183 (local), world_size=2, local_world_size=2 -(EngineCore pid=243) DEBUG 04-21 23:59:26 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/cd9cd408-5342-4764-8f72-5cff1c2769cc -(EngineCore pid=243) DEBUG 04-21 23:59:26 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_1004733a'), local_subscribe_addr='ipc:///tmp/cd9cd408-5342-4764-8f72-5cff1c2769cc', local_notify_addr='ipc:///tmp/0e466130-897f-4cf5-a03f-2da8be229250', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-21 23:59:30 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-21 23:59:30 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-21 23:59:30 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-21 23:59:30 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:59:30 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:59:30 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-21 23:59:30 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-21 23:59:30 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-21 23:59:30 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-21 23:59:30 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:59:30 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:59:30 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-21 23:59:30 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-21 23:59:30 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-21 23:59:30 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-21 23:59:30 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:59:30 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:59:30 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-21 23:59:30 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-21 23:59:30 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-21 23:59:30 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-21 23:59:30 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:59:30 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:59:30 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-21 23:59:35 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-21 23:59:35 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-21 23:59:36 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-21 23:59:36 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-21 23:59:36 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-21 23:59:36 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) DEBUG 04-21 23:59:36 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -DEBUG 04-21 23:59:36 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-21 23:59:36 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-21 23:59:36 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-21 23:59:36 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(Worker pid=443) DEBUG 04-21 23:59:36 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:57857 backend=nccl -(Worker pid=443) INFO 04-21 23:59:36 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:57857 backend=nccl -(Worker pid=442) DEBUG 04-21 23:59:37 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:57857 backend=nccl -(Worker pid=442) INFO 04-21 23:59:37 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:57857 backend=nccl -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=442) DEBUG 04-21 23:59:37 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=443) DEBUG 04-21 23:59:37 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 -(Worker pid=442) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=442) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=442) DEBUG 04-21 23:59:38 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=442) INFO 04-21 23:59:38 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=443) DEBUG 04-21 23:59:38 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=442) DEBUG 04-21 23:59:38 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=442) DEBUG 04-21 23:59:38 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/8bdb0334-b32f-4726-8ef3-de6647788486 -(Worker pid=442) DEBUG 04-21 23:59:38 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_8c80a23b'), local_subscribe_addr='ipc:///tmp/8bdb0334-b32f-4726-8ef3-de6647788486', local_notify_addr='ipc:///tmp/7d634166-ae36-4d6c-909d-eaddce568a1e', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=443) DEBUG 04-21 23:59:38 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/8bdb0334-b32f-4726-8ef3-de6647788486 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(Worker pid=442) INFO 04-21 23:59:38 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(Worker pid=442) DEBUG 04-21 23:59:39 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776815979.1975927, auto_measure=True -(Worker pid=442) DEBUG 04-21 23:59:39 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=443) DEBUG 04-21 23:59:39 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776815979.206957, auto_measure=True -(Worker pid=443) DEBUG 04-21 23:59:39 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=442) DEBUG 04-21 23:59:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=442) DEBUG 04-21 23:59:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=443) DEBUG 04-21 23:59:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-21 23:59:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=442) DEBUG 04-21 23:59:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-21 23:59:39 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=442) DEBUG 04-21 23:59:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=443) DEBUG 04-21 23:59:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=442) DEBUG 04-21 23:59:39 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=442) DEBUG 04-21 23:59:39 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=443) DEBUG 04-21 23:59:39 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=442) DEBUG 04-21 23:59:39 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_TP0 pid=442) INFO 04-21 23:59:39 [v1/worker/gpu_model_runner.py:4735] Starting to load model moonshotai/Kimi-Dev-72B... -(Worker_TP0 pid=442) DEBUG 04-21 23:59:39 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_TP0 pid=442) INFO 04-21 23:59:39 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(Worker_TP0 pid=442) INFO 04-21 23:59:39 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP1 pid=443) DEBUG 04-21 23:59:40 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP1 pid=443) DEBUG 04-21 23:59:40 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP1 pid=443) DEBUG 04-21 23:59:40 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=443) DEBUG 04-21 23:59:40 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP0 pid=442) DEBUG 04-21 23:59:40 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=442) DEBUG 04-21 23:59:40 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=442) DEBUG 04-21 23:59:40 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=442) DEBUG 04-21 23:59:40 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP1 pid=443) DEBUG 04-21 23:59:40 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-79-of-80.safetensors', 'model-27-of-80.safetensors', 'model-25-of-80.safetensors', 'model-46-of-80.safetensors', 'model-17-of-80.safetensors', 'model-8-of-80.safetensors', 'model-77-of-80.safetensors', 'model-41-of-80.safetensors', 'model-31-of-80.safetensors', 'model-39-of-80.safetensors', 'model-48-of-80.safetensors', 'model-47-of-80.safetensors', 'model-18-of-80.safetensors', 'model-66-of-80.safetensors', 'model-68-of-80.safetensors', 'model-21-of-80.safetensors', 'model-28-of-80.safetensors', 'model-62-of-80.safetensors', 'model-12-of-80.safetensors', 'model-51-of-80.safetensors', 'model-15-of-80.safetensors', 'model-34-of-80.safetensors', 'model-38-of-80.safetensors', 'model-61-of-80.safetensors', 'model-29-of-80.safetensors', 'model-35-of-80.safetensors', 'model-10-of-80.safetensors', 'model-54-of-80.safetensors', 'model-78-of-80.safetensors', 'model-55-of-80.safetensors', 'model-65-of-80.safetensors', 'model-24-of-80.safetensors', 'model-11-of-80.safetensors', 'model-53-of-80.safetensors', 'model-4-of-80.safetensors', 'model-2-of-80.safetensors', 'model-32-of-80.safetensors', 'model-74-of-80.safetensors', 'model-70-of-80.safetensors', 'model-14-of-80.safetensors', 'model-5-of-80.safetensors', 'model-43-of-80.safetensors', 'model-22-of-80.safetensors', 'model-72-of-80.safetensors', 'model-60-of-80.safetensors', 'model-33-of-80.safetensors', 'model-64-of-80.safetensors', 'model-42-of-80.safetensors', 'model-13-of-80.safetensors', 'model-75-of-80.safetensors', 'model-9-of-80.safetensors', 'model-56-of-80.safetensors', 'model-45-of-80.safetensors', 'model-44-of-80.safetensors', 'model-20-of-80.safetensors', 'model-71-of-80.safetensors', 'model-57-of-80.safetensors', 'model-76-of-80.safetensors', 'model-40-of-80.safetensors', 'model-19-of-80.safetensors', 'model-1-of-80.safetensors', 'model-36-of-80.safetensors', 
'model-58-of-80.safetensors', 'model-7-of-80.safetensors', 'model-16-of-80.safetensors', 'model-30-of-80.safetensors', 'model-59-of-80.safetensors', 'model-67-of-80.safetensors', 'model-80-of-80.safetensors', 'model-3-of-80.safetensors', 'model-6-of-80.safetensors', 'model-52-of-80.safetensors', 'model-26-of-80.safetensors', 'model-37-of-80.safetensors', 'model-49-of-80.safetensors', 'model-23-of-80.safetensors', 'model-73-of-80.safetensors', 'model-63-of-80.safetensors', 'model-69-of-80.safetensors', 'model-50-of-80.safetensors']] -(Worker_TP0 pid=442) DEBUG 04-21 23:59:40 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-65-of-80.safetensors', 'model-40-of-80.safetensors', 'model-66-of-80.safetensors', 'model-9-of-80.safetensors', 'model-68-of-80.safetensors', 'model-27-of-80.safetensors', 'model-15-of-80.safetensors', 'model-11-of-80.safetensors', 'model-76-of-80.safetensors', 'model-49-of-80.safetensors', 'model-21-of-80.safetensors', 'model-13-of-80.safetensors', 'model-73-of-80.safetensors', 'model-28-of-80.safetensors', 'model-52-of-80.safetensors', 'model-25-of-80.safetensors', 'model-69-of-80.safetensors', 'model-71-of-80.safetensors', 'model-75-of-80.safetensors', 'model-30-of-80.safetensors', 'model-45-of-80.safetensors', 'model-32-of-80.safetensors', 'model-64-of-80.safetensors', 'model-37-of-80.safetensors', 'model-1-of-80.safetensors', 'model-14-of-80.safetensors', 'model-60-of-80.safetensors', 'model-36-of-80.safetensors', 'model-39-of-80.safetensors', 'model-48-of-80.safetensors', 'model-74-of-80.safetensors', 'model-62-of-80.safetensors', 'model-77-of-80.safetensors', 'model-72-of-80.safetensors', 'model-58-of-80.safetensors', 'model-67-of-80.safetensors', 'model-19-of-80.safetensors', 'model-23-of-80.safetensors', 'model-59-of-80.safetensors', 'model-24-of-80.safetensors', 'model-29-of-80.safetensors', 'model-18-of-80.safetensors', 'model-4-of-80.safetensors', 'model-33-of-80.safetensors', 
'model-5-of-80.safetensors', 'model-78-of-80.safetensors', 'model-22-of-80.safetensors', 'model-10-of-80.safetensors', 'model-55-of-80.safetensors', 'model-16-of-80.safetensors', 'model-3-of-80.safetensors', 'model-43-of-80.safetensors', 'model-31-of-80.safetensors', 'model-7-of-80.safetensors', 'model-41-of-80.safetensors', 'model-53-of-80.safetensors', 'model-8-of-80.safetensors', 'model-80-of-80.safetensors', 'model-35-of-80.safetensors', 'model-56-of-80.safetensors', 'model-2-of-80.safetensors', 'model-17-of-80.safetensors', 'model-46-of-80.safetensors', 'model-34-of-80.safetensors', 'model-54-of-80.safetensors', 'model-79-of-80.safetensors', 'model-26-of-80.safetensors', 'model-47-of-80.safetensors', 'model-50-of-80.safetensors', 'model-42-of-80.safetensors', 'model-12-of-80.safetensors', 'model-6-of-80.safetensors', 'model-63-of-80.safetensors', 'model-57-of-80.safetensors', 'model-44-of-80.safetensors', 'model-20-of-80.safetensors', 'model-61-of-80.safetensors', 'model-51-of-80.safetensors', 'model-70-of-80.safetensors', 'model-38-of-80.safetensors']] -(Worker_TP0 pid=442) Loading safetensors checkpoint shards: 0% Completed | 0/80 [00:00 -(Worker_TP1 pid=443) DEBUG 04-22 00:01:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:01:25 [compilation/decorators.py:528] Start compiling function -(EngineCore pid=243) DEBUG 04-22 00:01:26 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) DEBUG 04-22 00:01:26 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP0 pid=442) INFO 04-22 00:01:34 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone for vLLM's torch.compile -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=defe415cac comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 
[compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
-(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP0 pid=442) DEBUG 04-22 
00:01:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP0 pid=442) 
DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 
00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP0 
pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP0 
pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] Vllm config hash: defe415cac -(Worker_TP0 pid=442) INFO 04-22 00:01:34 [compilation/backends.py:1111] Dynamo bytecode transform time: 8.75 s -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 4096 -(Worker_TP0 pid=442) INFO 04-22 00:01:34 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=defe415cac comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/0288147631/rank_1_0/backbone -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': 
None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 
'binary', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, 
-(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 
600, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 
[compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 
[compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 
[compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP1 
pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 
'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP1 pid=443) 
DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [compilation/backends.py:1074] Vllm config hash: defe415cac -(Worker_TP0 pid=442) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. -(Worker_TP0 pid=442) return func(*args, **kwargs) -(Worker_TP0 pid=442) DEBUG 04-22 00:01:34 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=4096, hidden_dim=8192, dtype=torch.bfloat16 -(Worker_TP1 pid=443) DEBUG 04-22 00:01:34 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=4096, hidden_dim=8192, dtype=torch.bfloat16 -(Worker_TP0 pid=442) INFO 04-22 00:01:34 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm -(Worker_TP0 pid=442) DEBUG 04-22 00:01:35 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 4096), (4097, 8192)] -(Worker_TP0 pid=442) DEBUG 04-22 00:01:35 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(Worker_TP0 pid=442) DEBUG 04-22 00:01:35 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 
pid=442) DEBUG 04-22 00:01:35 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:36 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP0 pid=442) DEBUG 04-22 00:01:36 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 34.6 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:36 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:36 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP0 pid=442) DEBUG 04-22 00:01:36 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:36 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 00:01:36 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:36 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP1 pid=443) DEBUG 04-22 00:01:36 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 34.9 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:36 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:36 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP1 pid=443) DEBUG 04-22 00:01:36 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(APIServer pid=1) DEBUG 04-22 00:01:36 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP0 pid=442) INFO 04-22 00:01:38 [compilation/backends.py:372] Cache the graph of compile range (1, 4096) for later use -(Worker_TP0 pid=442) DEBUG 04-22 00:01:38 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_0', '/data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_0') -(Worker_TP0 pid=442) DEBUG 04-22 00:01:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 00:01:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:38 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP0 pid=442) DEBUG 04-22 00:01:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=442) DEBUG 04-22 00:01:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 00:01:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:38 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP1 pid=443) DEBUG 04-22 00:01:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=443) DEBUG 04-22 00:01:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed 
in 0.3 ms -(Worker_TP0 pid=442) INFO 04-22 00:01:39 [compilation/backends.py:372] Cache the graph of compile range (4097, 8192) for later use -(Worker_TP0 pid=442) DEBUG 04-22 00:01:39 [compilation/backends.py:377] Store the 0-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_0') -(Worker_TP0 pid=442) DEBUG 04-22 00:01:39 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 00:01:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:39 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=442) DEBUG 04-22 00:01:39 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 61.3 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=442) DEBUG 04-22 00:01:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:40 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 00:01:40 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:40 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=443) DEBUG 04-22 00:01:40 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 62.4 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:40 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms 
-(Worker_TP1 pid=443) DEBUG 04-22 00:01:40 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=443) DEBUG 04-22 00:01:40 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:40 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_1', '/data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_1') -(Worker_TP0 pid=442) DEBUG 04-22 00:01:41 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 00:01:41 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:41 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP0 pid=442) DEBUG 04-22 00:01:41 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:41 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=442) DEBUG 04-22 00:01:41 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:41 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 00:01:41 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:41 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP1 pid=443) DEBUG 04-22 00:01:41 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:41 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 
nodes, removed 0 nodes -(Worker_TP1 pid=443) DEBUG 04-22 00:01:41 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:42 [compilation/backends.py:377] Store the 1-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_1') -(APIServer pid=1) DEBUG 04-22 00:01:46 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 61.5 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/backends.py:377] Store the 80-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_80', '/data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_80') -(Worker_TP0 pid=442) INFO 04-22 00:01:48 [compilation/backends.py:390] Compiling a graph for compile range (1, 4096) takes 9.96 s -(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=442) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:48 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=443) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 62.8 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP1 pid=443) DEBUG 04-22 00:01:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:49 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 00:01:49 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:49 [compilation/passes/pass_manager.py:100] Skipping with 
compile range (4097, 8192) -(Worker_TP1 pid=443) DEBUG 04-22 00:01:49 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP1 pid=443) DEBUG 04-22 00:01:49 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=443) DEBUG 04-22 00:01:49 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP0 pid=442) DEBUG 04-22 00:01:49 [compilation/backends.py:377] Store the 80-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_80', '/data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_80') -(Worker_TP0 pid=442) INFO 04-22 00:01:49 [compilation/backends.py:390] Compiling a graph for compile range (4097, 8192) takes 10.85 s -(Worker_TP0 pid=442) DEBUG 04-22 00:01:49 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/0288147631/rank_0_0/backbone/computation_graph.py -(Worker_TP0 pid=442) INFO 04-22 00:01:52 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/d910a456355fd0e3a8ef08216e2a64de8c7290dd49f50f279a253f6d2d652529/rank_0_0/model -(Worker_TP0 pid=442) INFO 04-22 00:01:52 [compilation/monitor.py:48] torch.compile took 26.70 s in total -(Worker_TP0 pid=442) INFO 04-22 00:01:53 [compilation/monitor.py:76] Initial profiling/warmup run took 1.95 s -(APIServer pid=1) DEBUG 04-22 00:01:56 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP1 pid=443) INFO 04-22 00:01:59 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP0 pid=442) INFO 04-22 00:01:59 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP1 pid=443) DEBUG 04-22 00:01:59 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP1 pid=443) INFO 04-22 00:01:59 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP0 pid=442) DEBUG 04-22 00:01:59 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP0 pid=442) INFO 04-22 00:01:59 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP1 pid=443) DEBUG 04-22 00:01:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:01:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:01:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:01:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:01:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=442) DEBUG 04-22 00:01:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 152.00 MiB first-capture + (51-1) × 18.00 MiB per-graph -(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 152.00 MiB first-capture + (51-1) × 18.00 MiB per-graph -(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
-(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=442) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 266.00 MiB first-capture + (51-1) × 10.00 MiB per-graph -(Worker_TP0 pid=442) INFO 04-22 00:02:00 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP1 pid=443) DEBUG 04-22 00:02:00 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 266.00 MiB first-capture + (51-1) × 10.00 MiB per-graph -(Worker_TP1 pid=443) INFO 04-22 00:02:00 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP0 pid=442) DEBUG 04-22 00:02:01 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP0 pid=442) INFO 04-22 00:02:01 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.63 GiB total -(Worker_TP1 pid=443) DEBUG 04-22 00:02:01 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP1 pid=443) INFO 04-22 00:02:01 [v1/worker/gpu_model_runner.py:5955] Estimated 
CUDA graph memory: 1.63 GiB total -(Worker_TP1 pid=443) DEBUG 04-22 00:02:01 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP1 pid=443) DEBUG 04-22 00:02:01 [v1/worker/gpu_worker.py:430] Free memory after profiling: 6.35 GiB (total), 3.93 GiB (within requested) -(Worker_TP1 pid=443) DEBUG 04-22 00:02:01 [v1/worker/gpu_worker.py:435] Memory profiling takes 36.19 seconds. Total non KV cache memory: 72.21GiB; torch peak memory increase: 2.29GiB; non-torch forward increase memory: 2.09GiB; weights memory: 67.82GiB. -(Worker_TP1 pid=443) INFO 04-22 00:02:01 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9705 to maintain the same effective KV cache size. -(Worker_TP1 pid=443) DEBUG 04-22 00:02:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=442) DEBUG 04-22 00:02:01 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP0 pid=442) DEBUG 04-22 00:02:01 [v1/worker/gpu_worker.py:430] Free memory after profiling: 6.35 GiB (total), 3.93 GiB (within requested) -(Worker_TP0 pid=442) DEBUG 04-22 00:02:01 [v1/worker/gpu_worker.py:435] Memory profiling takes 36.26 seconds. Total non KV cache memory: 72.21GiB; torch peak memory increase: 2.29GiB; non-torch forward increase memory: 2.09GiB; weights memory: 67.82GiB. 
-(Worker_TP0 pid=442) INFO 04-22 00:02:01 [v1/worker/gpu_worker.py:436] Available KV cache memory: 3.02 GiB -(Worker_TP0 pid=442) INFO 04-22 00:02:01 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9705 to maintain the same effective KV cache size. -(Worker_TP0 pid=442) DEBUG 04-22 00:02:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 00:02:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) INFO 04-22 00:02:01 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 19,776 tokens -(EngineCore pid=243) INFO 04-22 00:02:01 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 2.41x -(Worker_TP0 pid=442) DEBUG 04-22 00:02:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=443) DEBUG 04-22 00:02:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=443) INFO 04-22 00:02:01 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 -(Worker_TP0 pid=442) INFO 04-22 00:02:01 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 -(Worker_TP1 pid=443) DEBUG 04-22 00:02:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:02:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) 2026-04-22 00:02:01,768 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(Worker_TP0 pid=442) 2026-04-22 00:02:01,768 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP1 pid=443) DEBUG 04-22 00:02:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 00:02:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:02:02 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=442) 2026-04-22 00:02:02,650 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP1 pid=443) 2026-04-22 00:02:02,650 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP0 pid=442) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=243) INFO 04-22 00:02:14 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP1 pid=443) DEBUG 04-22 00:02:14 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=442) DEBUG 04-22 00:02:14 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 00:02:14 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 00:02:14 [v1/engine/core.py:1158] EngineCore waiting for work. -(EngineCore pid=243) DEBUG 04-22 00:02:14 [v1/engine/core.py:1158] EngineCore waiting for work. 
-(APIServer pid=1) INFO 04-22 00:02:14 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] -(APIServer pid=1) DEBUG 04-22 00:02:15 [renderers/base.py:197] Warming up chat template processing... -(APIServer pid=1) DEBUG 04-22 00:02:15 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e81007-4b54106943b22e9725ac587c;f5e0296d-657f-4ca7-b92e-fc2b46bcc673) -(APIServer pid=1) DEBUG 04-22 00:02:15 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 00:02:15 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-Dev-72B/resolve/main/processor_config.json. -(APIServer pid=1) DEBUG 04-22 00:02:15 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e81007-0795e5ec52136b1d03088420;101c0272-f6f3-4106-9a59-ba75fbc2f711) -(APIServer pid=1) DEBUG 04-22 00:02:15 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 00:02:15 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-Dev-72B/resolve/main/preprocessor_config.json. -(Worker_TP0 pid=442) DEBUG 04-22 00:02:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=443) DEBUG 04-22 00:02:15 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) INFO 04-22 00:02:15 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. 
-(APIServer pid=1) DEBUG 04-22 00:02:15 [renderers/base.py:203] Chat template warmup completed in 0.740s -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:37] Available routes are: -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /load, Methods: GET -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /version, Methods: GET -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /health, Methods: GET -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /ping, Methods: GET -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /ping, Methods: POST -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST -(APIServer pid=1) INFO 04-22 00:02:16 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST -(APIServer pid=1) INFO 04-22 00:02:16 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST -(APIServer pid=1) INFO: Started server process [1] -(APIServer pid=1) INFO: Waiting for application startup. -(APIServer pid=1) INFO: Application startup complete. -(APIServer pid=1) DEBUG 04-22 00:02:23 [v1/engine/async_llm.py:875] Called check_health. 
-(APIServer pid=1) INFO: 10.128.4.2:33468 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.FAILED.log b/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.FAILED.log deleted file mode 100644 index cd8bb00d..00000000 --- a/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.FAILED.log +++ /dev/null @@ -1,399 +0,0 @@ -DEBUG 04-22 00:02:38 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:02:38 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:02:38 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:02:38 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:02:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:02:38 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:02:38 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:02:38 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:02:38 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:02:38 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:02:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:02:38 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:02:43 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-22 00:02:45 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 00:02:45 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:02:45 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:02:45 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:02:45 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 00:02:45 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:02:45 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:02:45 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:02:45 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model moonshotai/Kimi-Dev-72B -(APIServer pid=1) INFO 04-22 00:02:45 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:02:45 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:02:45 [entrypoints/utils.py:233] non-default args: {'model_tag': 'moonshotai/Kimi-Dev-72B', 'model': 'moonshotai/Kimi-Dev-72B', 'trust_remote_code': True, 'max_model_len': 8192, 'tensor_parallel_size': 4, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:02:45 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:02:45 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:02:45 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0041262 secs -(APIServer pid=1) INFO 04-22 00:02:45 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM -(APIServer pid=1) INFO 04-22 00:02:45 
[config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:02:45 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:02:45 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:02:45 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:02:45 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:02:45 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:02:45 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 00:02:45 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:02:45 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) INFO 04-22 00:02:47 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 00:02:47 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. 
-(APIServer pid=1) DEBUG 04-22 00:02:47 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:02:47 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:02:47 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:02:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:02:51 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:02:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:02:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:02:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:02:51 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:02:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:02:51 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:02:51 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:02:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:02:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:02:51 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:02:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=245) DEBUG 04-22 00:02:57 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:02:57 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=245) DEBUG 04-22 00:02:57 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/24a4d7cd-445c-4acd-85bf-2b111701ad2a'], outputs=['ipc:///tmp/a8007d5e-3590-4db5-b7d9-eb2caffb59d6'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=245) DEBUG 04-22 00:02:57 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=245) DEBUG 04-22 00:02:57 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=245) DEBUG 04-22 00:02:57 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=245) DEBUG 04-22 00:02:57 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=245) DEBUG 04-22 00:02:57 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=245) INFO 04-22 00:02:57 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='moonshotai/Kimi-Dev-72B', speculative_config=None, tokenizer='moonshotai/Kimi-Dev-72B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=moonshotai/Kimi-Dev-72B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [128, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=245) WARNING 04-22 00:02:57 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=245) INFO 04-22 00:02:57 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.7.77 (local), world_size=4, local_world_size=4 -(EngineCore pid=245) DEBUG 04-22 00:02:57 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/0da9bf7e-69ce-4b6c-a385-5dc105766277 -(EngineCore pid=245) DEBUG 04-22 00:02:57 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_d1ae5a57'), local_subscribe_addr='ipc:///tmp/0da9bf7e-69ce-4b6c-a385-5dc105766277', local_notify_addr='ipc:///tmp/74d47614-c921-443b-b79b-34b5af61b7a5', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 00:03:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:03:01 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:03:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:03:01 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-22 00:03:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:03:01 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:03:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:03:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:03:01 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:03:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:03:01 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:03:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:03:01 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:03:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:03:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
-DEBUG 04-22 00:03:01 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:03:01 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:03:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:03:01 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:03:01 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:03:05 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:03:06 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:03:06 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-22 00:03:06 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:03:07 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:03:07 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:03:07 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:03:07 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 00:03:07 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:03:07 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:03:07 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:03:07 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 00:03:07 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:03:07 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:03:07 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:03:07 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-DEBUG 04-22 00:03:07 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:03:07 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:03:07 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:03:07 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) DEBUG 04-22 00:03:07 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker pid=447) DEBUG 04-22 00:03:08 [distributed/parallel_state.py:1356] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:49893 backend=nccl -(Worker pid=447) INFO 04-22 00:03:08 [distributed/parallel_state.py:1400] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:49893 backend=nccl -(Worker pid=444) DEBUG 04-22 00:03:08 [distributed/parallel_state.py:1356] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:49893 backend=nccl -(Worker pid=444) INFO 04-22 00:03:08 [distributed/parallel_state.py:1400] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:49893 backend=nccl -(Worker pid=446) DEBUG 04-22 00:03:08 [distributed/parallel_state.py:1356] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:49893 backend=nccl -(Worker pid=446) INFO 04-22 00:03:08 [distributed/parallel_state.py:1400] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:49893 backend=nccl -(Worker pid=445) DEBUG 04-22 00:03:08 [distributed/parallel_state.py:1356] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:49893 backend=nccl -(Worker pid=445) INFO 04-22 00:03:08 [distributed/parallel_state.py:1400] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:49893 
backend=nccl -[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=445) DEBUG 04-22 00:03:08 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=446) DEBUG 04-22 00:03:08 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=444) DEBUG 04-22 00:03:08 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=447) DEBUG 04-22 00:03:08 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(Worker pid=446) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=446) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=447) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=447) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=444) DEBUG 04-22 00:03:08 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=444) INFO 04-22 00:03:08 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=445) DEBUG 04-22 00:03:09 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=444) DEBUG 04-22 00:03:09 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=446) DEBUG 04-22 00:03:09 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=447) DEBUG 04-22 00:03:09 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=444) DEBUG 04-22 00:03:09 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/2869822b-8e70-436a-bbad-a93e3e91136e -(Worker pid=444) DEBUG 04-22 00:03:09 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3], buffer_handle=(3, 4194304, 6, 'psm_2bf3a753'), local_subscribe_addr='ipc:///tmp/2869822b-8e70-436a-bbad-a93e3e91136e', local_notify_addr='ipc:///tmp/5140ae1d-9907-467e-83e9-4c140f16fe69', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=447) DEBUG 04-22 00:03:09 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2869822b-8e70-436a-bbad-a93e3e91136e -(Worker pid=445) DEBUG 04-22 00:03:09 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2869822b-8e70-436a-bbad-a93e3e91136e -(Worker pid=446) DEBUG 04-22 00:03:09 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2869822b-8e70-436a-bbad-a93e3e91136e -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(Worker pid=444) INFO 04-22 00:03:09 [distributed/parallel_state.py:1716] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(Worker pid=446) DEBUG 04-22 00:03:10 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776816190.2286594, auto_measure=True -(Worker pid=446) DEBUG 04-22 00:03:10 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=444) DEBUG 04-22 00:03:10 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776816190.2827969, auto_measure=True -(Worker pid=444) DEBUG 04-22 00:03:10 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. 
-(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 605, in __init__ -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] self.worker.init_device() -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/worker_base.py", line 312, in init_device -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] self.worker.init_device() # type: ignore -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ 
-(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 283, in init_device -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] self.requested_memory = request_memory(init_snapshot, self.cache_config) -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/utils.py", line 413, in request_memory -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] raise ValueError( -(Worker pid=445) ERROR 04-22 00:03:10 [v1/executor/multiproc_executor.py:857] ValueError: Free memory on device cuda:1 (60.85/79.19 GiB) on startup is less than desired GPU memory utilization (0.95, 75.23 GiB). Decrease GPU memory utilization or reduce GPU memory used by other processes. 
-(EngineCore pid=245) DEBUG 04-22 00:03:10 [v1/executor/multiproc_executor.py:419] Worker Termination: allow workers to gracefully shutdown -(Worker pid=447) DEBUG 04-22 00:03:10 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776816190.2981615, auto_measure=True -(Worker pid=447) DEBUG 04-22 00:03:10 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=446) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=446) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=446) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=446) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=444) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=446) DEBUG 04-22 00:03:10 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). 
-(Worker pid=444) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=447) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=447) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=447) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=444) DEBUG 04-22 00:03:10 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=444) DEBUG 04-22 00:03:10 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=447) DEBUG 04-22 00:03:10 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=447) DEBUG 04-22 00:03:10 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=444) DEBUG 04-22 00:03:10 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_TP0 pid=444) INFO 04-22 00:03:10 [v1/worker/gpu_model_runner.py:4735] Starting to load model moonshotai/Kimi-Dev-72B... 
-(Worker_TP0 pid=444) DEBUG 04-22 00:03:10 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_TP0 pid=444) INFO 04-22 00:03:10 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(Worker_TP0 pid=444) INFO 04-22 00:03:10 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP2 pid=446) DEBUG 04-22 00:03:11 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP2 pid=446) DEBUG 04-22 00:03:11 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP2 pid=446) DEBUG 04-22 00:03:11 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP2 pid=446) DEBUG 04-22 00:03:11 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP0 pid=444) DEBUG 04-22 00:03:11 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP3 pid=447) DEBUG 04-22 00:03:11 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=444) DEBUG 04-22 00:03:11 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=444) DEBUG 04-22 00:03:11 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=444) DEBUG 04-22 00:03:11 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP3 pid=447) DEBUG 04-22 00:03:11 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP3 pid=447) DEBUG 04-22 00:03:11 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP3 pid=447) DEBUG 04-22 00:03:11 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP2 pid=446) DEBUG 04-22 00:03:11 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-10-of-80.safetensors', 'model-39-of-80.safetensors', 'model-15-of-80.safetensors', 'model-18-of-80.safetensors', 'model-14-of-80.safetensors', 'model-51-of-80.safetensors', 'model-66-of-80.safetensors', 'model-55-of-80.safetensors', 'model-21-of-80.safetensors', 'model-22-of-80.safetensors', 'model-36-of-80.safetensors', 'model-8-of-80.safetensors', 'model-79-of-80.safetensors', 'model-50-of-80.safetensors', 'model-40-of-80.safetensors', 'model-57-of-80.safetensors', 'model-30-of-80.safetensors', 'model-41-of-80.safetensors', 'model-32-of-80.safetensors', 'model-25-of-80.safetensors', 'model-24-of-80.safetensors', 'model-12-of-80.safetensors', 'model-77-of-80.safetensors', 'model-61-of-80.safetensors', 'model-73-of-80.safetensors', 'model-53-of-80.safetensors', 'model-4-of-80.safetensors', 'model-17-of-80.safetensors', 'model-38-of-80.safetensors', 'model-71-of-80.safetensors', 'model-6-of-80.safetensors', 'model-31-of-80.safetensors', 'model-64-of-80.safetensors', 'model-1-of-80.safetensors', 'model-54-of-80.safetensors', 'model-45-of-80.safetensors', 'model-35-of-80.safetensors', 'model-13-of-80.safetensors', 'model-37-of-80.safetensors', 'model-78-of-80.safetensors', 'model-72-of-80.safetensors', 'model-74-of-80.safetensors', 'model-3-of-80.safetensors', 'model-46-of-80.safetensors', 'model-29-of-80.safetensors', 'model-27-of-80.safetensors', 
'model-20-of-80.safetensors', 'model-75-of-80.safetensors', 'model-63-of-80.safetensors', 'model-44-of-80.safetensors', 'model-26-of-80.safetensors', 'model-23-of-80.safetensors', 'model-65-of-80.safetensors', 'model-62-of-80.safetensors', 'model-70-of-80.safetensors', 'model-9-of-80.safetensors', 'model-56-of-80.safetensors', 'model-42-of-80.safetensors', 'model-5-of-80.safetensors', 'model-58-of-80.safetensors', 'model-47-of-80.safetensors', 'model-52-of-80.safetensors', 'model-80-of-80.safetensors', 'model-67-of-80.safetensors', 'model-69-of-80.safetensors', 'model-59-of-80.safetensors', 'model-19-of-80.safetensors', 'model-7-of-80.safetensors', 'model-49-of-80.safetensors', 'model-76-of-80.safetensors', 'model-68-of-80.safetensors', 'model-16-of-80.safetensors', 'model-43-of-80.safetensors', 'model-2-of-80.safetensors', 'model-28-of-80.safetensors', 'model-34-of-80.safetensors', 'model-33-of-80.safetensors', 'model-11-of-80.safetensors', 'model-60-of-80.safetensors', 'model-48-of-80.safetensors']] -(Worker_TP0 pid=444) DEBUG 04-22 00:03:11 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-62-of-80.safetensors', 'model-1-of-80.safetensors', 'model-33-of-80.safetensors', 'model-31-of-80.safetensors', 'model-9-of-80.safetensors', 'model-13-of-80.safetensors', 'model-58-of-80.safetensors', 'model-21-of-80.safetensors', 'model-2-of-80.safetensors', 'model-39-of-80.safetensors', 'model-63-of-80.safetensors', 'model-60-of-80.safetensors', 'model-29-of-80.safetensors', 'model-10-of-80.safetensors', 'model-54-of-80.safetensors', 'model-23-of-80.safetensors', 'model-75-of-80.safetensors', 'model-36-of-80.safetensors', 'model-64-of-80.safetensors', 'model-28-of-80.safetensors', 'model-67-of-80.safetensors', 'model-48-of-80.safetensors', 'model-49-of-80.safetensors', 'model-8-of-80.safetensors', 'model-45-of-80.safetensors', 'model-14-of-80.safetensors', 'model-40-of-80.safetensors', 'model-7-of-80.safetensors', 
'model-65-of-80.safetensors', 'model-19-of-80.safetensors', 'model-5-of-80.safetensors', 'model-73-of-80.safetensors', 'model-53-of-80.safetensors', 'model-77-of-80.safetensors', 'model-35-of-80.safetensors', 'model-37-of-80.safetensors', 'model-41-of-80.safetensors', 'model-38-of-80.safetensors', 'model-18-of-80.safetensors', 'model-3-of-80.safetensors', 'model-52-of-80.safetensors', 'model-76-of-80.safetensors', 'model-51-of-80.safetensors', 'model-16-of-80.safetensors', 'model-34-of-80.safetensors', 'model-17-of-80.safetensors', 'model-47-of-80.safetensors', 'model-42-of-80.safetensors', 'model-27-of-80.safetensors', 'model-70-of-80.safetensors', 'model-74-of-80.safetensors', 'model-46-of-80.safetensors', 'model-12-of-80.safetensors', 'model-71-of-80.safetensors', 'model-66-of-80.safetensors', 'model-69-of-80.safetensors', 'model-61-of-80.safetensors', 'model-57-of-80.safetensors', 'model-43-of-80.safetensors', 'model-25-of-80.safetensors', 'model-79-of-80.safetensors', 'model-26-of-80.safetensors', 'model-22-of-80.safetensors', 'model-68-of-80.safetensors', 'model-30-of-80.safetensors', 'model-20-of-80.safetensors', 'model-59-of-80.safetensors', 'model-80-of-80.safetensors', 'model-24-of-80.safetensors', 'model-6-of-80.safetensors', 'model-72-of-80.safetensors', 'model-55-of-80.safetensors', 'model-78-of-80.safetensors', 'model-56-of-80.safetensors', 'model-50-of-80.safetensors', 'model-4-of-80.safetensors', 'model-32-of-80.safetensors', 'model-11-of-80.safetensors', 'model-15-of-80.safetensors', 'model-44-of-80.safetensors']] -(Worker_TP3 pid=447) DEBUG 04-22 00:03:11 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-67-of-80.safetensors', 'model-50-of-80.safetensors', 'model-51-of-80.safetensors', 'model-63-of-80.safetensors', 'model-66-of-80.safetensors', 'model-68-of-80.safetensors', 'model-39-of-80.safetensors', 'model-15-of-80.safetensors', 'model-3-of-80.safetensors', 'model-30-of-80.safetensors', 
'model-77-of-80.safetensors', 'model-47-of-80.safetensors', 'model-55-of-80.safetensors', 'model-57-of-80.safetensors', 'model-7-of-80.safetensors', 'model-11-of-80.safetensors', 'model-80-of-80.safetensors', 'model-56-of-80.safetensors', 'model-10-of-80.safetensors', 'model-35-of-80.safetensors', 'model-65-of-80.safetensors', 'model-21-of-80.safetensors', 'model-74-of-80.safetensors', 'model-1-of-80.safetensors', 'model-19-of-80.safetensors', 'model-4-of-80.safetensors', 'model-2-of-80.safetensors', 'model-18-of-80.safetensors', 'model-33-of-80.safetensors', 'model-22-of-80.safetensors', 'model-12-of-80.safetensors', 'model-61-of-80.safetensors', 'model-41-of-80.safetensors', 'model-58-of-80.safetensors', 'model-48-of-80.safetensors', 'model-42-of-80.safetensors', 'model-38-of-80.safetensors', 'model-25-of-80.safetensors', 'model-20-of-80.safetensors', 'model-78-of-80.safetensors', 'model-46-of-80.safetensors', 'model-37-of-80.safetensors', 'model-13-of-80.safetensors', 'model-9-of-80.safetensors', 'model-23-of-80.safetensors', 'model-59-of-80.safetensors', 'model-44-of-80.safetensors', 'model-72-of-80.safetensors', 'model-43-of-80.safetensors', 'model-40-of-80.safetensors', 'model-73-of-80.safetensors', 'model-24-of-80.safetensors', 'model-17-of-80.safetensors', 'model-49-of-80.safetensors', 'model-29-of-80.safetensors', 'model-6-of-80.safetensors', 'model-76-of-80.safetensors', 'model-45-of-80.safetensors', 'model-52-of-80.safetensors', 'model-34-of-80.safetensors', 'model-8-of-80.safetensors', 'model-27-of-80.safetensors', 'model-53-of-80.safetensors', 'model-26-of-80.safetensors', 'model-70-of-80.safetensors', 'model-16-of-80.safetensors', 'model-31-of-80.safetensors', 'model-69-of-80.safetensors', 'model-36-of-80.safetensors', 'model-14-of-80.safetensors', 'model-60-of-80.safetensors', 'model-28-of-80.safetensors', 'model-62-of-80.safetensors', 'model-54-of-80.safetensors', 'model-75-of-80.safetensors', 'model-64-of-80.safetensors', 
'model-32-of-80.safetensors', 'model-79-of-80.safetensors', 'model-5-of-80.safetensors', 'model-71-of-80.safetensors']] -(Worker_TP0 pid=444) Loading safetensors checkpoint shards: 0% Completed | 0/80 [00:00 -(APIServer pid=1) sys.exit(main()) -(APIServer pid=1) ^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main -(APIServer pid=1) args.dispatch_function(args) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd -(APIServer pid=1) uvloop.run(run_server(args)) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run -(APIServer pid=1) return __asyncio.run( -(APIServer pid=1) ^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run -(APIServer pid=1) return runner.run(main) -(APIServer pid=1) ^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run -(APIServer pid=1) return self._loop.run_until_complete(task) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper -(APIServer pid=1) return await main -(APIServer pid=1) ^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server -(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker -(APIServer pid=1) async with build_async_engine_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer 
pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client -(APIServer pid=1) async with build_async_engine_client_from_engine_args( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args -(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config -(APIServer pid=1) return cls( -(APIServer pid=1) ^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 154, in __init__ -(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(APIServer pid=1) return func(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client -(APIServer pid=1) return AsyncMPClient(*client_args) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(APIServer pid=1) return func(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 887, in __init__ -(APIServer pid=1) super().__init__( -(APIServer pid=1) File 
"/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 535, in __init__ -(APIServer pid=1) with launch_core_engines( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ -(APIServer pid=1) next(self.gen) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines -(APIServer pid=1) wait_for_engine_startup( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup -(APIServer pid=1) raise RuntimeError( -(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} -/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown - warnings.warn('resource_tracker: There appear to be %d ' diff --git a/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.log deleted file mode 100644 index 6b00b529..00000000 --- a/accuracy/results/v0.19.0/logs/moonshotai-kimi-dev-72b--h100-80gb--tp4pp1dp1--8192.log +++ /dev/null @@ -1,2854 +0,0 @@ -DEBUG 04-22 01:32:24 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:32:24 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:32:24 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:32:24 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:32:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:32:24 [platforms/__init__.py:113] Checking if ROCm platform is available. 
-DEBUG 04-22 01:32:24 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:32:24 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:32:24 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:32:24 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:32:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:32:24 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:32:29 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:32:30 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 01:32:30 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:32:30 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:32:30 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:32:30 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:32:31 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:32:31 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:32:31 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:32:31 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model moonshotai/Kimi-Dev-72B -(APIServer pid=1) INFO 04-22 01:32:31 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:32:31 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:32:31 [entrypoints/utils.py:233] non-default args: {'model_tag': 'moonshotai/Kimi-Dev-72B', 'model': 'moonshotai/Kimi-Dev-72B', 'trust_remote_code': True, 'max_model_len': 8192, 'tensor_parallel_size': 4, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:32:31 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:32:31 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:32:31 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0007761 secs -(APIServer pid=1) INFO 04-22 01:32:31 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM -(APIServer pid=1) INFO 04-22 01:32:31 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 01:32:31 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:32:31 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:32:31 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:32:31 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 01:32:31 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:32:31 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 01:32:31 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:32:31 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) INFO 04-22 01:32:33 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 01:32:33 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:32:33 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:32:34 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:32:34 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:32:37 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:32:37 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:32:37 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:32:37 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:32:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:32:38 [platforms/__init__.py:113] Checking if ROCm platform is available. 
-DEBUG 04-22 01:32:38 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:32:38 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:32:38 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:32:38 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:32:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:32:38 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:32:42 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=243) DEBUG 04-22 01:32:44 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:32:44 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=243) DEBUG 04-22 01:32:44 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/9bc312a7-0b33-4921-9fae-b1f2ff7db368'], outputs=['ipc:///tmp/d3d42416-6cd7-4dcf-9c68-fe976bc2ee76'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 01:32:44 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 01:32:44 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 01:32:44 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 01:32:44 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 01:32:44 [plugins/__init__.py:49] All plugins in this group will be 
loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(EngineCore pid=243) INFO 04-22 01:32:44 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='moonshotai/Kimi-Dev-72B', speculative_config=None, tokenizer='moonshotai/Kimi-Dev-72B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=moonshotai/Kimi-Dev-72B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [128, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) WARNING 04-22 01:32:44 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=243) INFO 04-22 01:32:44 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.3.122 (local), world_size=4, local_world_size=4 -(EngineCore pid=243) DEBUG 04-22 01:32:44 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/a440f459-16ec-4f4b-b25a-28418caf8c99 -(EngineCore pid=243) DEBUG 04-22 01:32:44 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_7e5af658'), local_subscribe_addr='ipc:///tmp/a440f459-16ec-4f4b-b25a-28418caf8c99', local_notify_addr='ipc:///tmp/638a17a4-2ae0-4a34-86c2-b2ff6ae03f64', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 01:32:47 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:32:47 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:32:47 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:32:47 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:32:47 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:32:47 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:32:47 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:32:47 [platforms/__init__.py:62] Checking if CUDA platform is available. 
-DEBUG 04-22 01:32:47 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:32:47 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:32:47 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:32:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:32:47 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:32:47 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:165] Checking if CPU platform is available. 
-DEBUG 04-22 01:32:47 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:32:47 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:32:47 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:32:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:32:47 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:32:47 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:32:52 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:32:52 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:32:52 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-22 01:32:52 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(APIServer pid=1) DEBUG 04-22 01:32:54 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -DEBUG 04-22 01:32:54 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:32:54 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:32:54 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:32:54 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 01:32:54 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:32:54 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:32:54 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:32:54 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 01:32:54 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:32:54 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:32:54 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:32:54 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-DEBUG 04-22 01:32:54 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:32:54 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:32:54 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:32:54 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(Worker pid=442) DEBUG 04-22 01:32:55 [distributed/parallel_state.py:1356] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:60879 backend=nccl -(Worker pid=442) INFO 04-22 01:32:55 [distributed/parallel_state.py:1400] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:60879 backend=nccl -(Worker pid=443) DEBUG 04-22 01:32:55 [distributed/parallel_state.py:1356] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:60879 backend=nccl -(Worker pid=443) INFO 04-22 01:32:55 [distributed/parallel_state.py:1400] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:60879 backend=nccl -(Worker pid=445) DEBUG 04-22 01:32:55 [distributed/parallel_state.py:1356] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:60879 backend=nccl -(Worker pid=445) INFO 04-22 01:32:55 [distributed/parallel_state.py:1400] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:60879 backend=nccl -(Worker pid=444) DEBUG 04-22 01:32:55 [distributed/parallel_state.py:1356] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:60879 backend=nccl -(Worker pid=444) INFO 04-22 01:32:55 [distributed/parallel_state.py:1400] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:60879 backend=nccl -[Gloo] Rank 2 is connected to 3 peer ranks. 
Expected number of connected peer ranks is : 3 -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=442) DEBUG 04-22 01:32:55 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=444) DEBUG 04-22 01:32:55 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=443) DEBUG 04-22 01:32:55 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=445) DEBUG 04-22 01:32:55 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(Worker pid=442) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=442) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=442) DEBUG 04-22 01:32:55 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=442) INFO 04-22 01:32:55 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=443) DEBUG 04-22 01:32:56 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=445) DEBUG 04-22 01:32:56 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=442) DEBUG 04-22 01:32:56 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=444) DEBUG 04-22 01:32:56 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=442) DEBUG 04-22 01:32:57 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/2553b6ac-4d99-40c7-8c8e-99b100945ec0 -(Worker pid=442) DEBUG 04-22 01:32:57 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3], buffer_handle=(3, 4194304, 6, 'psm_de4932bf'), local_subscribe_addr='ipc:///tmp/2553b6ac-4d99-40c7-8c8e-99b100945ec0', local_notify_addr='ipc:///tmp/153501ca-24ac-4494-b023-cabdee1b39ce', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=444) DEBUG 04-22 01:32:57 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2553b6ac-4d99-40c7-8c8e-99b100945ec0 -(Worker pid=443) DEBUG 04-22 01:32:57 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2553b6ac-4d99-40c7-8c8e-99b100945ec0 -(Worker pid=445) DEBUG 04-22 01:32:57 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/2553b6ac-4d99-40c7-8c8e-99b100945ec0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(Worker pid=442) INFO 04-22 01:32:57 [distributed/parallel_state.py:1716] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(Worker pid=442) DEBUG 04-22 01:32:57 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776821577.3190374, auto_measure=True -(Worker pid=442) DEBUG 04-22 01:32:57 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=443) DEBUG 04-22 01:32:57 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776821577.3577726, auto_measure=True -(Worker pid=443) DEBUG 04-22 01:32:57 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=444) DEBUG 04-22 01:32:57 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776821577.3819945, auto_measure=True -(Worker pid=444) DEBUG 04-22 01:32:57 [v1/worker/gpu_worker.py:285] worker 
requested memory: 75.23GiB -(Worker pid=445) DEBUG 04-22 01:32:57 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776821577.4167647, auto_measure=True -(Worker pid=445) DEBUG 04-22 01:32:57 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=442) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=442) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=442) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=442) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=442) DEBUG 04-22 01:32:57 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(Worker pid=443) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=442) DEBUG 04-22 01:32:57 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=444) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=444) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=445) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=445) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=443) DEBUG 04-22 01:32:57 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). 
-(Worker pid=445) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=442) DEBUG 04-22 01:32:57 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker pid=444) DEBUG 04-22 01:32:57 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker_TP0 pid=442) INFO 04-22 01:32:57 [v1/worker/gpu_model_runner.py:4735] Starting to load model moonshotai/Kimi-Dev-72B... -(Worker pid=445) DEBUG 04-22 01:32:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=445) DEBUG 04-22 01:32:57 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker_TP0 pid=442) DEBUG 04-22 01:32:57 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_TP0 pid=442) INFO 04-22 01:32:57 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(Worker_TP0 pid=442) INFO 04-22 01:32:57 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP0 pid=442) DEBUG 04-22 01:32:58 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=442) DEBUG 04-22 01:32:58 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=442) DEBUG 04-22 01:32:58 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=442) DEBUG 04-22 01:32:58 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP1 pid=443) DEBUG 04-22 01:32:58 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP1 pid=443) DEBUG 04-22 01:32:58 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP1 pid=443) DEBUG 04-22 01:32:58 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=443) DEBUG 04-22 01:32:58 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP3 pid=445) DEBUG 04-22 01:32:58 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP3 pid=445) DEBUG 04-22 01:32:58 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP3 pid=445) DEBUG 04-22 01:32:58 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP3 pid=445) DEBUG 04-22 01:32:58 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP2 pid=444) DEBUG 04-22 01:32:58 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP2 pid=444) DEBUG 04-22 01:32:58 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP2 pid=444) DEBUG 04-22 01:32:58 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP2 pid=444) DEBUG 04-22 01:32:58 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP0 pid=442) DEBUG 04-22 01:32:58 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-22-of-80.safetensors', 'model-32-of-80.safetensors', 'model-46-of-80.safetensors', 'model-36-of-80.safetensors', 'model-53-of-80.safetensors', 'model-29-of-80.safetensors', 'model-8-of-80.safetensors', 'model-4-of-80.safetensors', 'model-24-of-80.safetensors', 'model-2-of-80.safetensors', 'model-12-of-80.safetensors', 'model-59-of-80.safetensors', 'model-11-of-80.safetensors', 'model-45-of-80.safetensors', 'model-72-of-80.safetensors', 'model-13-of-80.safetensors', 'model-51-of-80.safetensors', 'model-33-of-80.safetensors', 'model-1-of-80.safetensors', 'model-67-of-80.safetensors', 'model-30-of-80.safetensors', 'model-74-of-80.safetensors', 'model-80-of-80.safetensors', 'model-76-of-80.safetensors', 'model-26-of-80.safetensors', 'model-71-of-80.safetensors', 'model-44-of-80.safetensors', 'model-31-of-80.safetensors', 'model-78-of-80.safetensors', 'model-57-of-80.safetensors', 'model-6-of-80.safetensors', 'model-39-of-80.safetensors', 'model-7-of-80.safetensors', 'model-16-of-80.safetensors', 'model-28-of-80.safetensors', 'model-42-of-80.safetensors', 'model-56-of-80.safetensors', 'model-79-of-80.safetensors', 'model-61-of-80.safetensors', 'model-3-of-80.safetensors', 'model-38-of-80.safetensors', 'model-21-of-80.safetensors', 'model-5-of-80.safetensors', 
'model-68-of-80.safetensors', 'model-25-of-80.safetensors', 'model-14-of-80.safetensors', 'model-62-of-80.safetensors', 'model-65-of-80.safetensors', 'model-40-of-80.safetensors', 'model-73-of-80.safetensors', 'model-23-of-80.safetensors', 'model-15-of-80.safetensors', 'model-19-of-80.safetensors', 'model-43-of-80.safetensors', 'model-35-of-80.safetensors', 'model-47-of-80.safetensors', 'model-58-of-80.safetensors', 'model-18-of-80.safetensors', 'model-27-of-80.safetensors', 'model-17-of-80.safetensors', 'model-52-of-80.safetensors', 'model-55-of-80.safetensors', 'model-63-of-80.safetensors', 'model-34-of-80.safetensors', 'model-60-of-80.safetensors', 'model-64-of-80.safetensors', 'model-49-of-80.safetensors', 'model-50-of-80.safetensors', 'model-75-of-80.safetensors', 'model-69-of-80.safetensors', 'model-77-of-80.safetensors', 'model-48-of-80.safetensors', 'model-20-of-80.safetensors', 'model-41-of-80.safetensors', 'model-10-of-80.safetensors', 'model-70-of-80.safetensors', 'model-66-of-80.safetensors', 'model-37-of-80.safetensors', 'model-9-of-80.safetensors', 'model-54-of-80.safetensors']] -(Worker_TP0 pid=442) Loading safetensors checkpoint shards: 0% Completed | 0/80 [00:00 -(Worker_TP1 pid=443) DEBUG 04-22 01:34:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 01:34:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 01:34:04 [compilation/decorators.py:528] Start compiling function -(Worker_TP2 pid=444) DEBUG 04-22 01:34:04 [compilation/decorators.py:528] Start compiling function -(Worker_TP0 pid=442) DEBUG 04-22 01:34:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 01:34:04 [compilation/decorators.py:528] Start compiling function -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] Traced files (to be considered for 
compilation cache): -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP0 pid=442) INFO 04-22 01:34:13 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone for vLLM's torch.compile -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=fc91086a3c comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
-(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP0 pid=442) DEBUG 04-22 
01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP0 pid=442) 
DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 
01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP0 
pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP0 
pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] Vllm config hash: fc91086a3c -(Worker_TP0 pid=442) INFO 04-22 01:34:13 [compilation/backends.py:1111] Dynamo bytecode transform time: 9.27 s -(Worker_TP0 pid=442) DEBUG 04-22 01:34:13 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 2 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 128 -(Worker_TP0 pid=442) INFO 04-22 01:34:13 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=fc91086a3c comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_1_0/backbone -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': 
None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 
'binary', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, 
-(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 
600, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP1 
pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP1 pid=443) 
DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP1 pid=443) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] Vllm config hash: fc91086a3c -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=fc91086a3c comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 
dir=/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_2_0/backbone -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP2 
pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP2 pid=444) DEBUG 
04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP2 pid=444) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] Vllm config hash: fc91086a3c -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py 
-(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP3 
pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=fc91086a3c comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_3_0/backbone -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP3 pid=445) DEBUG 
04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP3 pid=445) DEBUG 04-22 
01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP3 pid=445) DEBUG 
04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP3 pid=445) DEBUG 
04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 
0, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP3 pid=445) DEBUG 
04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP3 pid=445) DEBUG 04-22 01:34:13 [compilation/backends.py:1074] Vllm config hash: fc91086a3c -(APIServer pid=1) DEBUG 04-22 01:34:14 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=442) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. 
-(Worker_TP0 pid=442) return func(*args, **kwargs) -(Worker_TP1 pid=443) DEBUG 04-22 01:34:14 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=1, max_token_num=128, hidden_dim=8192, dtype=torch.bfloat16 -(Worker_TP0 pid=442) DEBUG 04-22 01:34:14 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=0, max_token_num=128, hidden_dim=8192, dtype=torch.bfloat16 -(Worker_TP3 pid=445) DEBUG 04-22 01:34:14 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=3, max_token_num=128, hidden_dim=8192, dtype=torch.bfloat16 -(Worker_TP2 pid=444) DEBUG 04-22 01:34:14 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=2, max_token_num=128, hidden_dim=8192, dtype=torch.bfloat16 -(Worker_TP0 pid=442) INFO 04-22 01:34:14 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm -(Worker_TP0 pid=442) DEBUG 04-22 01:34:15 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 128), (129, 8192)] -(Worker_TP0 pid=442) DEBUG 04-22 01:34:15 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(Worker_TP0 pid=442) DEBUG 04-22 01:34:16 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:16 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP0 pid=442) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 35.9 ms -(Worker_TP0 pid=442) 
DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:16 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP0 pid=442) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:16 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:16 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=444) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:16 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP1 pid=443) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 35.5 ms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:16 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP1 pid=443) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:16 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=444) DEBUG 04-22 01:34:16 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP3 pid=445) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:16 
[compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 36.5 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:16 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP2 pid=444) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:16 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP3 pid=445) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.6 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:16 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP3 pid=445) DEBUG 04-22 01:34:16 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.7 ms -(Worker_TP0 pid=442) INFO 04-22 01:34:18 [compilation/backends.py:372] Cache the graph of compile range (1, 128) for later use -(Worker_TP0 pid=442) DEBUG 04-22 01:34:18 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 128) from inductor_standalone via handle ('artifact_compile_range_1_128_subgraph_0', '/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone/artifact_compile_range_1_128_subgraph_0') -(Worker_TP0 pid=442) DEBUG 04-22 01:34:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 01:34:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:19 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP0 pid=442) DEBUG 04-22 01:34:19 
[compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=442) DEBUG 04-22 01:34:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=444) DEBUG 04-22 01:34:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:19 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP2 pid=444) DEBUG 04-22 01:34:19 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=444) DEBUG 04-22 01:34:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=445) DEBUG 04-22 01:34:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:19 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP3 pid=445) DEBUG 04-22 01:34:19 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=445) DEBUG 04-22 01:34:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=442) INFO 04-22 01:34:20 [compilation/backends.py:372] Cache 
the graph of compile range (129, 8192) for later use -(Worker_TP0 pid=442) DEBUG 04-22 01:34:20 [compilation/backends.py:377] Store the 0-th graph for compile range(129, 8192) from inductor_standalone via handle ('artifact_compile_range_129_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone/artifact_compile_range_129_8192_subgraph_0') -(Worker_TP1 pid=443) DEBUG 04-22 01:34:20 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:20 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP1 pid=443) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:20 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=443) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:20 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:20 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=442) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 65.8 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:20 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=442) DEBUG 04-22 01:34:20 
[compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:20 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=444) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:20 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP2 pid=444) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 66.3 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:20 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP2 pid=444) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.6 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:20 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=445) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:20 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP3 pid=445) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 68.0 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:20 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP3 pid=445) DEBUG 04-22 01:34:20 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.6 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:21 [compilation/backends.py:377] Store the 1-th graph for 
compile range(1, 128) from inductor_standalone via handle ('artifact_compile_range_1_128_subgraph_1', '/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone/artifact_compile_range_1_128_subgraph_1') -(Worker_TP1 pid=443) DEBUG 04-22 01:34:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:21 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=443) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 68.8 ms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:21 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=443) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:21 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP0 pid=442) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:21 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=442) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:21 [compilation/.../utility/noop_elimination.py:105] Removed 
0 no-op reshapes and slices -(Worker_TP2 pid=444) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:21 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP2 pid=444) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:21 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=444) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=445) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:21 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP3 pid=445) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:21 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=445) DEBUG 04-22 01:34:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:22 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 01:34:22 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:22 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP1 pid=443) DEBUG 04-22 01:34:22 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP1 pid=443) DEBUG 
04-22 01:34:22 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=443) DEBUG 04-22 01:34:22 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:23 [compilation/backends.py:377] Store the 1-th graph for compile range(129, 8192) from inductor_standalone via handle ('artifact_compile_range_129_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone/artifact_compile_range_129_8192_subgraph_1') -(APIServer pid=1) DEBUG 04-22 01:34:24 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 67.7 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/backends.py:377] Store the 80-th graph for compile range(1, 128) from inductor_standalone via handle ('artifact_compile_range_1_128_subgraph_80', '/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone/artifact_compile_range_1_128_subgraph_80') -(Worker_TP0 pid=442) INFO 04-22 01:34:29 [compilation/backends.py:390] Compiling a graph 
for compile range (1, 128) takes 10.62 s -(Worker_TP2 pid=444) DEBUG 04-22 01:34:29 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=444) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=442) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:29 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP2 pid=444) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 66.9 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:29 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP2 pid=444) DEBUG 04-22 01:34:29 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:30 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=444) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP2 
pid=444) DEBUG 04-22 01:34:30 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP2 pid=444) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP2 pid=444) DEBUG 04-22 01:34:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=444) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:30 [compilation/backends.py:377] Store the 80-th graph for compile range(129, 8192) from inductor_standalone via handle ('artifact_compile_range_129_8192_subgraph_80', '/data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone/artifact_compile_range_129_8192_subgraph_80') -(Worker_TP0 pid=442) INFO 04-22 01:34:30 [compilation/backends.py:390] Compiling a graph for compile range (129, 8192) takes 11.54 s -(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 67.8 ms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes 
-(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 65.4 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=442) DEBUG 04-22 01:34:30 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/6cf3b9ba99/rank_0_0/backbone/computation_graph.py -(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.6 ms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=443) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed 
in 0.6 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/passes/pass_manager.py:100] Skipping with compile range (129, 8192) -(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=445) DEBUG 04-22 01:34:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=442) INFO 04-22 01:34:33 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/0fb79f10c474cc267fe2b1b1648d8efdb738b7e56acfe7e43c9b23ddaa70d3c4/rank_0_0/model -(Worker_TP0 pid=442) INFO 04-22 01:34:33 [compilation/monitor.py:48] torch.compile took 28.92 s in total -(APIServer pid=1) DEBUG 04-22 01:34:34 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP0 pid=442) INFO 04-22 01:34:35 [compilation/monitor.py:76] Initial profiling/warmup run took 2.25 s -(Worker_TP0 pid=442) INFO 04-22 01:34:41 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP0 pid=442) DEBUG 04-22 01:34:41 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP0 pid=442) INFO 04-22 01:34:41 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP3 pid=445) INFO 04-22 01:34:41 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP3 pid=445) DEBUG 04-22 01:34:41 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP3 pid=445) INFO 04-22 01:34:41 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP1 pid=443) INFO 04-22 01:34:41 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP1 pid=443) DEBUG 04-22 01:34:41 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP1 pid=443) INFO 04-22 01:34:41 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP0 pid=442) DEBUG 04-22 01:34:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 01:34:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 01:34:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) INFO 04-22 01:34:42 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP2 pid=444) DEBUG 04-22 01:34:42 [v1/worker/gpu_model_runner.py:5818] 
Initialized minimal KV cache for CUDA graph profiling -(Worker_TP2 pid=444) INFO 04-22 01:34:42 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP2 pid=444) DEBUG 04-22 01:34:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 132.00 MiB first-capture + (51-1) × 16.00 MiB per-graph -(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 132.00 MiB 
first-capture + (51-1) × 16.00 MiB per-graph -(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 132.00 MiB first-capture + (51-1) × 16.00 MiB per-graph -(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 132.00 MiB first-capture + (51-1) × 16.00 MiB per-graph -(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on 
(FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=442) DEBUG 04-22 01:34:43 
[v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 142.00 MiB first-capture + (51-1) × 12.00 MiB per-graph -(Worker_TP3 pid=445) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 142.00 MiB first-capture + (51-1) × 12.00 MiB per-graph -(Worker_TP0 pid=442) INFO 04-22 01:34:43 [distributed/device_communicators/custom_all_reduce.py:216] Registering 322 cuda graph addresses -(Worker_TP3 pid=445) INFO 04-22 01:34:43 [distributed/device_communicators/custom_all_reduce.py:216] Registering 322 cuda graph addresses -(Worker_TP1 pid=443) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 142.00 MiB first-capture + (51-1) × 12.00 MiB per-graph -(Worker_TP1 pid=443) INFO 04-22 01:34:43 [distributed/device_communicators/custom_all_reduce.py:216] Registering 322 cuda graph addresses -(Worker_TP2 pid=444) DEBUG 04-22 01:34:43 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 142.00 MiB first-capture + (51-1) × 12.00 MiB per-graph -(Worker_TP2 pid=444) INFO 04-22 01:34:43 [distributed/device_communicators/custom_all_reduce.py:216] Registering 322 cuda graph addresses -(APIServer pid=1) DEBUG 04-22 01:34:44 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP0 pid=442) DEBUG 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP0 pid=442) INFO 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.51 GiB total -(Worker_TP1 pid=443) DEBUG 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP1 pid=443) INFO 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.51 GiB total -(Worker_TP3 pid=445) DEBUG 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP3 pid=445) INFO 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.51 GiB total -(Worker_TP2 pid=444) DEBUG 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP2 pid=444) INFO 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.51 GiB total -(Worker_TP0 pid=442) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP0 pid=442) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:430] Free memory after profiling: 40.54 GiB (total), 38.13 GiB (within requested) -(Worker_TP0 pid=442) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:435] Memory profiling takes 40.40 seconds. Total non KV cache memory: 38.51GiB; torch peak memory increase: 2.29GiB; non-torch forward increase memory: 2.21GiB; weights memory: 34.01GiB. -(Worker_TP0 pid=442) INFO 04-22 01:34:44 [v1/worker/gpu_worker.py:436] Available KV cache memory: 36.72 GiB -(Worker_TP0 pid=442) INFO 04-22 01:34:44 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. 
To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9690 to maintain the same effective KV cache size. -(Worker_TP0 pid=442) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=445) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP3 pid=445) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:430] Free memory after profiling: 40.54 GiB (total), 38.13 GiB (within requested) -(Worker_TP3 pid=445) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:435] Memory profiling takes 40.46 seconds. Total non KV cache memory: 38.51GiB; torch peak memory increase: 2.29GiB; non-torch forward increase memory: 2.21GiB; weights memory: 34.01GiB. -(Worker_TP3 pid=445) INFO 04-22 01:34:44 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9690 to maintain the same effective KV cache size. 
-(Worker_TP3 pid=445) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=443) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP1 pid=443) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:430] Free memory after profiling: 40.54 GiB (total), 38.13 GiB (within requested) -(Worker_TP1 pid=443) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:435] Memory profiling takes 40.43 seconds. Total non KV cache memory: 38.51GiB; torch peak memory increase: 2.29GiB; non-torch forward increase memory: 2.21GiB; weights memory: 34.01GiB. -(Worker_TP1 pid=443) INFO 04-22 01:34:44 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9690 to maintain the same effective KV cache size. -(Worker_TP1 pid=443) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=444) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP2 pid=444) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:430] Free memory after profiling: 40.54 GiB (total), 38.13 GiB (within requested) -(Worker_TP2 pid=444) DEBUG 04-22 01:34:44 [v1/worker/gpu_worker.py:435] Memory profiling takes 40.54 seconds. 
Total non KV cache memory: 38.51GiB; torch peak memory increase: 2.29GiB; non-torch forward increase memory: 2.21GiB; weights memory: 34.01GiB. -(Worker_TP2 pid=444) INFO 04-22 01:34:44 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9690 to maintain the same effective KV cache size. -(Worker_TP2 pid=444) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) INFO 04-22 01:34:44 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 481,296 tokens -(EngineCore pid=243) INFO 04-22 01:34:44 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 58.75x -(Worker_TP3 pid=445) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=444) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=442) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=443) DEBUG 04-22 01:34:44 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=445) 2026-04-22 01:34:44,946 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP1 pid=443) 2026-04-22 01:34:44,947 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP0 pid=442) 2026-04-22 01:34:44,947 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(Worker_TP2 pid=444) 2026-04-22 01:34:44,950 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP3 pid=445) DEBUG 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=443) DEBUG 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=444) DEBUG 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) DEBUG 04-22 01:34:44 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=445) 2026-04-22 01:34:45,053 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP0 pid=442) 2026-04-22 01:34:45,053 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP1 pid=443) 2026-04-22 01:34:45,053 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP2 pid=444) 2026-04-22 01:34:45,053 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP3 pid=445) DEBUG 04-22 01:34:45 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=442) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=243) INFO 04-22 01:34:57 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP1 pid=443) DEBUG 04-22 01:34:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=442) DEBUG 04-22 01:34:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=444) DEBUG 04-22 01:34:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=445) DEBUG 04-22 01:34:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 01:34:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=243) DEBUG 04-22 01:34:57 [v1/engine/core.py:1158] EngineCore waiting for work. -(EngineCore pid=243) DEBUG 04-22 01:34:57 [v1/engine/core.py:1158] EngineCore waiting for work. -(APIServer pid=1) INFO 04-22 01:34:57 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] -(Worker_TP3 pid=445) DEBUG 04-22 01:34:58 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=442) DEBUG 04-22 01:34:58 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=444) DEBUG 04-22 01:34:58 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=443) DEBUG 04-22 01:34:58 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) DEBUG 04-22 01:34:59 [renderers/base.py:197] Warming up chat template processing... -(APIServer pid=1) DEBUG 04-22 01:34:59 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. 
(Request ID: Root=1-69e825c3-262cbbf84c1596706b620aed;550b368f-0e76-44a4-a224-60910a22ce88) -(APIServer pid=1) DEBUG 04-22 01:34:59 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 01:34:59 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-Dev-72B/resolve/main/processor_config.json. -(APIServer pid=1) DEBUG 04-22 01:34:59 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e825c3-70eafc0c3e5c2e900043d0c7;f53b034b-cfa3-49f9-be82-3d60dba5c7b9) -(APIServer pid=1) DEBUG 04-22 01:34:59 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 01:34:59 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-Dev-72B/resolve/main/preprocessor_config.json. -(APIServer pid=1) INFO 04-22 01:35:01 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. 
-(APIServer pid=1) DEBUG 04-22 01:35:01 [renderers/base.py:203] Chat template warmup completed in 1.285s -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:37] Available routes are: -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /load, Methods: GET -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /version, Methods: GET -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /health, Methods: GET -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /ping, Methods: GET -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /ping, Methods: POST -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST -(APIServer pid=1) INFO 04-22 01:35:02 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST -(APIServer pid=1) INFO 04-22 01:35:02 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST -(APIServer pid=1) INFO: Started server process [1] -(APIServer pid=1) INFO: Waiting for application startup. -(APIServer pid=1) INFO: Application startup complete. -(APIServer pid=1) DEBUG 04-22 01:35:09 [v1/engine/async_llm.py:875] Called check_health. -(APIServer pid=1) INFO: 10.131.2.2:45350 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.log deleted file mode 100644 index ee8f5d9f..00000000 --- a/accuracy/results/v0.19.0/logs/moonshotai-kimi-vl-a3b-i--h100-80gb--tp1pp1dp1--8192.log +++ /dev/null @@ -1,1126 +0,0 @@ -DEBUG 04-21 23:53:06 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
-DEBUG 04-21 23:53:06 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-21 23:53:06 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-21 23:53:06 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:53:06 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:53:06 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-21 23:53:06 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-21 23:53:06 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-21 23:53:06 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-21 23:53:06 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:53:06 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:53:06 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-21 23:53:11 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-21 23:53:13 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-21 23:53:13 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-21 23:53:13 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-21 23:53:13 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-21 23:53:13 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-21 23:53:13 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-21 23:53:13 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-21 23:53:13 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-21 23:53:13 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model moonshotai/Kimi-VL-A3B-Instruct -(APIServer pid=1) INFO 04-21 23:53:13 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-21 23:53:13 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-21 23:53:13 [entrypoints/utils.py:233] non-default args: {'model_tag': 'moonshotai/Kimi-VL-A3B-Instruct', 'model': 'moonshotai/Kimi-VL-A3B-Instruct', 'trust_remote_code': True, 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-21 23:53:13 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-21 23:53:14 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.kimi_vl.KimiVLForConditionalGeneration from cache -(APIServer pid=1) DEBUG 04-21 23:53:14 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0025763 secs -(APIServer pid=1) INFO 04-21 23:53:14 [config/model.py:549] Resolved architecture: KimiVLForConditionalGeneration -(APIServer pid=1) INFO 04-21 23:53:14 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-21 23:53:14 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-21 23:53:14 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-21 23:53:14 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-21 23:53:14 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-21 23:53:14 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-21 23:53:14 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-21 23:53:14 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-21 23:53:14 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-21 23:53:14 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) WARNING 04-21 23:53:14 [tokenizers/registry.py:212] Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead. -(APIServer pid=1) DEBUG 04-21 23:53:14 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-21 23:53:15 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. -(APIServer pid=1) DEBUG 04-21 23:53:15 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -(APIServer pid=1) DEBUG 04-21 23:53:15 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80deb-1ca2657603acf53a72029f01;435fb308-1aeb-440e-8511-57fefe40e375) -(APIServer pid=1) DEBUG 04-21 23:53:15 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-21 23:53:15 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. -(APIServer pid=1) DEBUG 04-21 23:53:16 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. 
(Request ID: Root=1-69e80dec-4e4380984c55a25e0c831aa6;ff27d06d-bd40-4e2d-b896-41d657dab11e) -(APIServer pid=1) DEBUG 04-21 23:53:16 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-21 23:53:16 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. -DEBUG 04-21 23:53:23 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-21 23:53:23 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-21 23:53:23 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-21 23:53:23 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:53:23 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:53:23 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-21 23:53:23 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-21 23:53:23 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-21 23:53:23 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-21 23:53:23 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:53:23 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:53:23 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-21 23:53:28 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=244) DEBUG 04-21 23:53:29 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-21 23:53:29 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=244) DEBUG 04-21 23:53:29 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/5f55a193-a5dc-4b3c-8486-b927e244d0cc'], outputs=['ipc:///tmp/2769faea-c662-424e-993f-d1ea27de3c9c'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=244) DEBUG 04-21 23:53:29 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=244) DEBUG 04-21 23:53:29 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=244) DEBUG 04-21 23:53:29 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=244) DEBUG 04-21 23:53:29 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=244) DEBUG 04-21 23:53:29 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=244) INFO 04-21 23:53:29 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='moonshotai/Kimi-VL-A3B-Instruct', speculative_config=None, tokenizer='moonshotai/Kimi-VL-A3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=moonshotai/Kimi-VL-A3B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=244) DEBUG 04-21 23:53:30 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(EngineCore pid=244) WARNING 04-21 23:53:30 [tokenizers/registry.py:212] Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead. -(EngineCore pid=244) DEBUG 04-21 23:53:31 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.19:46795 backend=nccl -(EngineCore pid=244) INFO 04-21 23:53:31 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.19:46795 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(EngineCore pid=244) DEBUG 04-21 23:53:31 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) INFO 04-21 23:53:31 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A -(EngineCore pid=244) DEBUG 04-21 23:53:31 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776815611.7651086, auto_measure=True -(EngineCore pid=244) DEBUG 04-21 23:53:31 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=244) DEBUG 04-21 23:53:31 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-21 23:53:31 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=244) DEBUG 04-21 23:53:31 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-21 23:53:31 [compilation/decorators.py:213] Inferred dynamic dimensions for forward 
method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=244) DEBUG 04-21 23:53:31 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(EngineCore pid=244) DEBUG 04-21 23:53:31 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=244) DEBUG 04-21 23:53:31 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. -(EngineCore pid=244) DEBUG 04-21 23:53:32 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80dfb-656344db4555b931477bec0a;0deda71c-ac0d-4025-a3eb-3d9396dea2c6) -(EngineCore pid=244) DEBUG 04-21 23:53:32 [transformers_utils/repo_utils.py:243] -(EngineCore pid=244) DEBUG 04-21 23:53:32 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. -(EngineCore pid=244) DEBUG 04-21 23:53:34 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80dfe-3feddb5218e61836715685f8;7c2e9747-00f7-4f1c-900e-19c1d1d4f167) -(EngineCore pid=244) DEBUG 04-21 23:53:34 [transformers_utils/repo_utils.py:243] -(EngineCore pid=244) DEBUG 04-21 23:53:34 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. -(EngineCore pid=244) DEBUG 04-21 23:53:38 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=244) INFO 04-21 23:53:38 [v1/worker/gpu_model_runner.py:4735] Starting to load model moonshotai/Kimi-VL-A3B-Instruct... 
-(EngineCore pid=244) INFO 04-21 23:53:38 [platforms/cuda.py:390] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention -(EngineCore pid=244) INFO 04-21 23:53:38 [model_executor/.../attention/mm_encoder_attention.py:230] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention. -(EngineCore pid=244) INFO 04-21 23:53:38 [config/vllm.py:790] Asynchronous scheduling is enabled. -(EngineCore pid=244) DEBUG 04-21 23:53:39 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=576, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=True, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASHINFER_MLA: [compute capability not supported], FLASHMLA_SPARSE: [non-sparse not supported]}. -(EngineCore pid=244) INFO 04-21 23:53:39 [platforms/cuda.py:334] Using FLASH_ATTN_MLA attention backend out of potential backends: ['FLASH_ATTN_MLA', 'FLASHMLA', 'TRITON_MLA']. -(EngineCore pid=244) INFO 04-21 23:53:39 [model_executor/.../attention/mla_attention.py:2137] Using FlashAttention prefill for MLA -(EngineCore pid=244) INFO 04-21 23:53:39 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. -(EngineCore pid=244) INFO 04-21 23:53:39 [model_executor/.../oracle/unquantized.py:186] Using TRITON backend for Unquantized MoE -(EngineCore pid=244) DEBUG 04-21 23:53:39 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts -(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=244) DEBUG 04-21 23:53:39 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=244) DEBUG 04-21 23:53:39 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) -(EngineCore pid=244) DEBUG 04-21 23:53:39 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 82, 'mla_decode_concat_quant_fp8': 27, 'silu_and_mul': 27, 'fused_moe': 26, 'unquantized_fused_moe': 26, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=244) DEBUG 04-21 23:53:39 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) -(EngineCore pid=244) DEBUG 04-21 23:53:39 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 82, 'mla_decode_concat_quant_fp8': 27, 'silu_and_mul': 27, 'fused_moe': 26, 'unquantized_fused_moe': 26, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=244) DEBUG 04-21 23:53:39 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(EngineCore pid=244) DEBUG 04-21 23:53:39 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00006-of-00007.safetensors', 'model-00007-of-00007.safetensors', 'model-00004-of-00007.safetensors', 'model-00002-of-00007.safetensors', 'model-00001-of-00007.safetensors', 'model-00005-of-00007.safetensors', 'model-00003-of-00007.safetensors']] -(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00 -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/mla_attention.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/mla.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v2.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=244) INFO 04-21 23:54:14 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/b96ff633a9/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=632e15ae31 comp=e546579c48 code=9292e945a1cdc87ebcb24cd65543d194ea60732523e9e0a6221751ff295bdfed dir=/data/.cache/vllm/torch_compile_cache/b96ff633a9/rank_0_0/backbone -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': 
None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, 
-(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=244) 
DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', 
-(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 
'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 
'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': 
False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 
[compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore 
pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 
'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=244) DEBUG 04-21 
23:54:14 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=244) DEBUG 04-21 23:54:14 [compilation/backends.py:1074] Vllm config hash: 632e15ae31 -(EngineCore pid=244) INFO 04-21 23:54:15 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.34 s -(EngineCore pid=244) DEBUG 04-21 23:54:15 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=244) DEBUG 04-21 23:54:15 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=244) DEBUG 04-21 23:54:15 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-21 23:54:15 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=244) DEBUG 04-21 23:54:15 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(EngineCore pid=244) DEBUG 04-21 23:54:15 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-21 23:54:15 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=244) INFO 04-21 23:54:18 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=244) DEBUG 04-21 23:54:18 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/b96ff633a9/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=244) DEBUG 04-21 23:54:18 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-21 23:54:18 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=244) DEBUG 04-21 23:54:18 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(EngineCore pid=244) DEBUG 04-21 23:54:18 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-21 23:54:18 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(APIServer pid=1) DEBUG 04-21 23:54:19 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=244) DEBUG 04-21 23:54:20 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/b96ff633a9/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=244) DEBUG 04-21 23:54:20 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-21 23:54:20 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=244) DEBUG 04-21 23:54:20 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(EngineCore pid=244) DEBUG 04-21 23:54:20 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-21 23:54:20 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=244) DEBUG 04-21 23:54:22 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_2', 
'/data/.cache/vllm/torch_compile_cache/b96ff633a9/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_2') -(EngineCore pid=244) DEBUG 04-21 23:54:23 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-21 23:54:23 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=244) DEBUG 04-21 23:54:23 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(EngineCore pid=244) DEBUG 04-21 23:54:23 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-21 23:54:23 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=244) DEBUG 04-21 23:54:23 [compilation/backends.py:377] Store the 27-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_27', '/data/.cache/vllm/torch_compile_cache/b96ff633a9/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_27') -(EngineCore pid=244) INFO 04-21 23:54:23 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 8.30 s -(EngineCore pid=244) DEBUG 04-21 23:54:23 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/b96ff633a9/rank_0_0/backbone/computation_graph.py -(EngineCore pid=244) INFO 04-21 23:54:24 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/27aaa92cd5a8b1d618310af6363b4ccd68035f614eff82150e0efa0993b9fb9d/rank_0_0/model -(EngineCore pid=244) INFO 04-21 23:54:24 [compilation/monitor.py:48] torch.compile took 13.76 s in total -(EngineCore pid=244) WARNING 04-21 23:54:25 [model_executor/.../fused_moe/fused_moe.py:1090] Using default MoE config. Performance might be sub-optimal! 
Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=1408,device_name=NVIDIA_H100_80GB_HBM3.json -(EngineCore pid=244) INFO 04-21 23:54:26 [compilation/monitor.py:76] Initial profiling/warmup run took 1.94 s -(APIServer pid=1) DEBUG 04-21 23:54:29 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=244) INFO 04-21 23:54:31 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=244) DEBUG 04-21 23:54:31 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=244) INFO 04-21 23:54:31 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-21 23:54:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-21 23:54:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 106.00 MiB first-capture + (51-1) × 8.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-21 23:54:32 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-21 23:54:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-21 23:54:32 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-21 23:54:32 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 522.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-21 23:54:33 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=244) INFO 04-21 23:54:33 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.19 GiB total -(EngineCore pid=244) DEBUG 04-21 23:54:33 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=244) DEBUG 04-21 23:54:33 [v1/worker/gpu_worker.py:430] Free memory after profiling: 46.83 GiB (total), 43.38 GiB (within requested) -(EngineCore pid=244) DEBUG 04-21 23:54:33 [v1/worker/gpu_worker.py:435] Memory profiling takes 23.31 seconds. Total non KV cache memory: 33.91GiB; torch peak memory increase: 2.92GiB; non-torch forward increase memory: 0.25GiB; weights memory: 30.74GiB. 
-(EngineCore pid=244) INFO 04-21 23:54:33 [v1/worker/gpu_worker.py:436] Available KV cache memory: 41.32 GiB -(EngineCore pid=244) INFO 04-21 23:54:33 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9651 to maintain the same effective KV cache size. -(EngineCore pid=244) INFO 04-21 23:54:33 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,426,368 tokens -(EngineCore pid=244) INFO 04-21 23:54:33 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 174.12x -(EngineCore pid=244) 2026-04-21 23:54:33,586 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=244) DEBUG 04-21 23:54:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) 2026-04-21 23:54:33,615 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-21 23:57:07 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-21 23:57:07 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-21 23:57:07 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-21 23:57:07 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-21 23:57:07 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-21 23:57:07 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model moonshotai/Kimi-VL-A3B-Instruct -(APIServer pid=1) INFO 04-21 23:57:07 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-21 23:57:07 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-21 23:57:07 [entrypoints/utils.py:233] non-default args: {'model_tag': 'moonshotai/Kimi-VL-A3B-Instruct', 'model': 'moonshotai/Kimi-VL-A3B-Instruct', 'trust_remote_code': True, 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-21 23:57:07 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-21 23:57:08 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.kimi_vl.KimiVLForConditionalGeneration from cache -(APIServer pid=1) DEBUG 04-21 23:57:08 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0034862 secs -(APIServer pid=1) INFO 04-21 23:57:08 [config/model.py:549] Resolved architecture: KimiVLForConditionalGeneration -(APIServer pid=1) INFO 04-21 23:57:08 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-21 23:57:08 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-21 23:57:08 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-21 23:57:08 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-21 23:57:08 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-21 23:57:08 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-21 23:57:08 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-21 23:57:08 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-21 23:57:08 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) DEBUG 04-21 23:57:08 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. -(APIServer pid=1) INFO 04-21 23:57:08 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-21 23:57:09 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-21 23:57:09 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) WARNING 04-21 23:57:09 [tokenizers/registry.py:212] Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead. -(APIServer pid=1) DEBUG 04-21 23:57:09 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-21 23:57:09 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. 
-(APIServer pid=1) DEBUG 04-21 23:57:09 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -(APIServer pid=1) DEBUG 04-21 23:57:10 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80ed6-087dc9aa3871621f3872a3a4;1e736337-4cc0-4549-a520-1c575d029903) -(APIServer pid=1) DEBUG 04-21 23:57:10 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-21 23:57:10 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. -(APIServer pid=1) DEBUG 04-21 23:57:11 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80ed7-0f06928f1da9c20c258d924c;e4bb89c5-ea75-4fdc-87a3-765fff118610) -(APIServer pid=1) DEBUG 04-21 23:57:11 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-21 23:57:11 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. -DEBUG 04-21 23:57:18 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-21 23:57:18 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-21 23:57:18 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-21 23:57:18 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:57:18 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:57:18 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-21 23:57:18 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-21 23:57:18 [platforms/__init__.py:134] Checking if XPU platform is available. 
-DEBUG 04-21 23:57:18 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-21 23:57:18 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:57:18 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:57:18 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-21 23:57:23 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(APIServer pid=1) DEBUG 04-21 23:57:24 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. -(EngineCore pid=245) DEBUG 04-21 23:57:24 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-21 23:57:24 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=245) DEBUG 04-21 23:57:24 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/53a8b05b-abcf-4e7e-aee7-b70e26a95d27'], outputs=['ipc:///tmp/8179865b-8008-423f-9410-4cde8ed0a4ec'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=245) DEBUG 04-21 23:57:24 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=245) DEBUG 04-21 23:57:24 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=245) DEBUG 04-21 23:57:24 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=245) DEBUG 04-21 23:57:24 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=245) DEBUG 04-21 23:57:24 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=245) INFO 04-21 23:57:24 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='moonshotai/Kimi-VL-A3B-Instruct', speculative_config=None, tokenizer='moonshotai/Kimi-VL-A3B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=moonshotai/Kimi-VL-A3B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 
'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=245) WARNING 04-21 23:57:24 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=245) INFO 04-21 23:57:24 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.128.4.181 (local), world_size=2, local_world_size=2 -(EngineCore pid=245) DEBUG 04-21 23:57:24 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/209d9e2f-5438-4f2f-868c-5e831de3572e -(EngineCore pid=245) DEBUG 04-21 23:57:24 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_9b94b597'), local_subscribe_addr='ipc:///tmp/209d9e2f-5438-4f2f-868c-5e831de3572e', local_notify_addr='ipc:///tmp/e98e7316-2b82-4fa7-8b00-5dc7a6284217', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-21 23:57:28 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-21 23:57:28 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-21 23:57:28 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-21 23:57:28 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-21 23:57:28 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-21 23:57:28 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-21 23:57:28 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:57:28 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:57:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:57:28 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-21 23:57:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
-DEBUG 04-21 23:57:28 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-21 23:57:28 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-21 23:57:28 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-21 23:57:28 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:57:28 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-21 23:57:28 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-21 23:57:28 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-21 23:57:28 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-21 23:57:28 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-21 23:57:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:57:28 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-21 23:57:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-21 23:57:28 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-21 23:57:33 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-21 23:57:33 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-21 23:57:34 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-21 23:57:34 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-21 23:57:34 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-21 23:57:34 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-21 23:57:34 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-21 23:57:34 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-21 23:57:34 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-21 23:57:34 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) DEBUG 04-21 23:57:34 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -DEBUG 04-21 23:57:35 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -DEBUG 04-21 23:57:35 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -WARNING 04-21 23:57:35 [tokenizers/registry.py:212] Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead. -(Worker pid=444) DEBUG 04-21 23:57:35 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:55183 backend=nccl -(Worker pid=444) INFO 04-21 23:57:35 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:55183 backend=nccl -WARNING 04-21 23:57:36 [tokenizers/registry.py:212] Using a slow tokenizer. 
This might cause a significant slowdown. Consider using a fast tokenizer instead. -(Worker pid=445) DEBUG 04-21 23:57:36 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:55183 backend=nccl -(Worker pid=445) INFO 04-21 23:57:36 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:55183 backend=nccl -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=445) DEBUG 04-21 23:57:36 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=444) DEBUG 04-21 23:57:36 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(Worker pid=444) DEBUG 04-21 23:57:36 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=444) INFO 04-21 23:57:36 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=445) DEBUG 04-21 23:57:37 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=444) DEBUG 04-21 23:57:37 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=444) DEBUG 04-21 23:57:37 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/4634f212-25f1-4a79-bbe7-2d22deb8d514 -(Worker pid=444) DEBUG 04-21 23:57:37 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_eba5827f'), local_subscribe_addr='ipc:///tmp/4634f212-25f1-4a79-bbe7-2d22deb8d514', local_notify_addr='ipc:///tmp/7a2e1266-15b1-4565-b1ec-33a83d7f84c1', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=445) DEBUG 04-21 23:57:37 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/4634f212-25f1-4a79-bbe7-2d22deb8d514 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=444) INFO 04-21 23:57:37 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A -(Worker pid=445) DEBUG 04-21 23:57:38 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.18GiB, total_memory=79.19GiB, cuda_memory=2.01GiB, torch_memory=0.02GiB, non_torch_memory=1.99GiB, timestamp=1776815858.1115682, auto_measure=True -(Worker pid=445) DEBUG 04-21 23:57:38 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=444) DEBUG 04-21 23:57:38 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.18GiB, total_memory=79.19GiB, cuda_memory=2.01GiB, torch_memory=0.02GiB, non_torch_memory=1.99GiB, timestamp=1776815858.153387, auto_measure=True -(Worker pid=444) DEBUG 04-21 23:57:38 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=445) DEBUG 04-21 23:57:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=445) DEBUG 04-21 23:57:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=445) DEBUG 04-21 23:57:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=445) DEBUG 04-21 23:57:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=444) DEBUG 04-21 23:57:38 [compilation/decorators.py:213] 
Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-21 23:57:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=444) DEBUG 04-21 23:57:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=445) DEBUG 04-21 23:57:38 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=444) DEBUG 04-21 23:57:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=444) DEBUG 04-21 23:57:38 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=444) DEBUG 04-21 23:57:38 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=445) DEBUG 04-21 23:57:38 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80ef2-2dea93487f2b57286bbd2791;5057faf0-b328-471e-acd5-73c6b4cd8a8b) -(Worker pid=445) DEBUG 04-21 23:57:38 [transformers_utils/repo_utils.py:243] -(Worker pid=445) DEBUG 04-21 23:57:38 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. -(Worker pid=444) DEBUG 04-21 23:57:38 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. 
(Request ID: Root=1-69e80ef2-1a640bbc17e245a301bebbd5;373adc4d-7871-4c01-bc7f-8ef75479df0c) -(Worker pid=444) DEBUG 04-21 23:57:38 [transformers_utils/repo_utils.py:243] -(Worker pid=444) DEBUG 04-21 23:57:38 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. -(Worker pid=445) DEBUG 04-21 23:57:41 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80ef5-30f4eb86103918785d3573ed;74e46153-3902-467b-ac2e-3792e551f1e7) -(Worker pid=445) DEBUG 04-21 23:57:41 [transformers_utils/repo_utils.py:243] -(Worker pid=445) DEBUG 04-21 23:57:41 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. -(Worker pid=444) DEBUG 04-21 23:57:41 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80ef5-19441e254ef6204f13830997;3069dd3a-dab8-4503-98dc-c508429e3cd6) -(Worker pid=444) DEBUG 04-21 23:57:41 [transformers_utils/repo_utils.py:243] -(Worker pid=444) DEBUG 04-21 23:57:41 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. -(Worker_TP1 pid=445) DEBUG 04-21 23:57:44 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. -(Worker pid=444) DEBUG 04-21 23:57:44 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_TP0 pid=444) INFO 04-21 23:57:44 [v1/worker/gpu_model_runner.py:4735] Starting to load model moonshotai/Kimi-VL-A3B-Instruct... 
-(Worker_TP1 pid=445) DEBUG 04-21 23:57:44 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP1 pid=445) DEBUG 04-21 23:57:44 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) -(Worker_TP1 pid=445) DEBUG 04-21 23:57:44 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 82, 'mla_decode_concat_quant_fp8': 27, 'silu_and_mul': 27, 'fused_moe': 26, 'unquantized_fused_moe': 26, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=445) DEBUG 04-21 23:57:44 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) -(Worker_TP1 pid=445) DEBUG 04-21 23:57:44 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 82, 'mla_decode_concat_quant_fp8': 27, 'silu_and_mul': 27, 'fused_moe': 26, 'unquantized_fused_moe': 26, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=445) DEBUG 04-21 23:57:44 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP0 pid=444) INFO 04-21 23:57:44 [platforms/cuda.py:390] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention -(Worker_TP0 pid=444) INFO 04-21 23:57:44 [model_executor/.../attention/mm_encoder_attention.py:230] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention. -(Worker_TP0 pid=444) INFO 04-21 23:57:44 [config/vllm.py:790] Asynchronous scheduling is enabled. -(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. 
-(Worker_TP0 pid=444) INFO 04-21 23:57:44 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP1 pid=445) DEBUG 04-21 23:57:44 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00007-of-00007.safetensors', 'model-00002-of-00007.safetensors', 'model-00006-of-00007.safetensors', 'model-00001-of-00007.safetensors', 'model-00004-of-00007.safetensors', 'model-00003-of-00007.safetensors', 'model-00005-of-00007.safetensors']] -(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=576, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=True, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASHINFER_MLA: [compute capability not supported], FLASHMLA_SPARSE: [non-sparse not supported]}. -(Worker_TP0 pid=444) INFO 04-21 23:57:44 [platforms/cuda.py:334] Using FLASH_ATTN_MLA attention backend out of potential backends: ['FLASH_ATTN_MLA', 'FLASHMLA', 'TRITON_MLA']. -(Worker_TP0 pid=444) INFO 04-21 23:57:44 [model_executor/.../attention/mla_attention.py:2137] Using FlashAttention prefill for MLA -(Worker_TP0 pid=444) INFO 04-21 23:57:44 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. -(Worker_TP0 pid=444) INFO 04-21 23:57:44 [model_executor/.../oracle/unquantized.py:186] Using TRITON backend for Unquantized MoE -(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts -(APIServer pid=1) DEBUG 04-21 23:57:44 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) -(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 82, 'mla_decode_concat_quant_fp8': 27, 'silu_and_mul': 27, 'fused_moe': 26, 'unquantized_fused_moe': 26, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [config/compilation.py:1194] enabled custom ops: Counter({'mm_encoder_attn': 27}) -(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 82, 'mla_decode_concat_quant_fp8': 27, 'silu_and_mul': 27, 'fused_moe': 26, 'unquantized_fused_moe': 26, 'conv2d': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=444) DEBUG 04-21 23:57:44 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP0 pid=444) DEBUG 04-21 23:57:45 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00003-of-00007.safetensors', 'model-00001-of-00007.safetensors', 'model-00007-of-00007.safetensors', 'model-00004-of-00007.safetensors', 'model-00006-of-00007.safetensors', 'model-00002-of-00007.safetensors', 'model-00005-of-00007.safetensors']] -(Worker_TP0 pid=444) Loading safetensors checkpoint shards: 0% Completed | 0/7 [00:00 -(Worker_TP1 pid=445) DEBUG 04-21 23:58:13 [compilation/decorators.py:528] Start compiling function -(EngineCore pid=245) DEBUG 04-21 23:58:13 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) DEBUG 04-21 23:58:14 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP1 
pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/mla_attention.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/mla.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v2.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/transformers/configuration_utils.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/mla_attention.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/mla.py -(Worker_TP0 pid=444) DEBUG 
04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/deepseek_v2.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=d8fa51fca8 comp=e546579c48 code=a8ee4e07d8cfb16ff10c7a91bbf306af44d6b5cf8dda22a08a2b5e657ca5d21c dir=/data/.cache/vllm/torch_compile_cache/68d54dc2af/rank_1_0/backbone -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-21 
23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, 
-(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 
'relax', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, 
-(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 
[compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP1 pid=445) DEBUG 04-21 
23:58:17 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 
'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP1 pid=445) 
DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 
'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP1 pid=445) DEBUG 
04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP1 pid=445) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] Vllm config hash: d8fa51fca8 -(Worker_TP0 pid=444) INFO 04-21 23:58:17 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/68d54dc2af/rank_0_0/backbone for vLLM's torch.compile -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=d8fa51fca8 comp=e546579c48 code=a8ee4e07d8cfb16ff10c7a91bbf306af44d6b5cf8dda22a08a2b5e657ca5d21c dir=/data/.cache/vllm/torch_compile_cache/68d54dc2af/rank_0_0/backbone -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP0 pid=444) DEBUG 
04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 
'VLLM_API_KEY': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 
'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 
'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP0 pid=444) DEBUG 
04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP0 pid=444) 
DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 
[compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 
[compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, 
-(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/backends.py:1074] Vllm config hash: d8fa51fca8 -(Worker_TP0 pid=444) INFO 04-21 23:58:17 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.28 s -(Worker_TP0 pid=444) DEBUG 04-21 23:58:17 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 8192 -(Worker_TP0 pid=444) INFO 04-21 23:58:17 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm -(Worker_TP0 pid=444) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. 
-(Worker_TP0 pid=444) return func(*args, **kwargs) -(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=8192, hidden_dim=2048, dtype=torch.bfloat16 -(Worker_TP1 pid=445) DEBUG 04-21 23:58:18 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=8192, hidden_dim=2048, dtype=torch.bfloat16 -(Worker_TP0 pid=444) INFO 04-21 23:58:18 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm -(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 0 patterns -(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 1.7 ms -(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=444) DEBUG 04-21 23:58:18 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP1 pid=445) DEBUG 04-21 23:58:18 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and 
slices -(Worker_TP1 pid=445) DEBUG 04-21 23:58:18 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP1 pid=445) DEBUG 04-21 23:58:18 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 0 patterns -(Worker_TP1 pid=445) DEBUG 04-21 23:58:18 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 1.6 ms -(Worker_TP1 pid=445) DEBUG 04-21 23:58:18 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-21 23:58:18 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=445) DEBUG 04-21 23:58:18 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP0 pid=444) INFO 04-21 23:58:21 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(Worker_TP0 pid=444) DEBUG 04-21 23:58:21 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/68d54dc2af/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(Worker_TP0 pid=444) DEBUG 04-21 23:58:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-21 23:58:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP1 pid=445) DEBUG 04-21 23:58:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-21 23:58:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(Worker_TP0 pid=444) DEBUG 04-21 23:58:21 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=444) DEBUG 04-21 23:58:21 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.4 ms -(Worker_TP0 
pid=444) DEBUG 04-21 23:58:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-21 23:58:21 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=444) DEBUG 04-21 23:58:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-21 23:58:21 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=445) DEBUG 04-21 23:58:21 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.9 ms -(Worker_TP1 pid=445) DEBUG 04-21 23:58:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-21 23:58:21 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=445) DEBUG 04-21 23:58:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-21 23:58:22 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/68d54dc2af/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(Worker_TP0 pid=444) DEBUG 04-21 23:58:23 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-21 23:58:23 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP1 pid=445) DEBUG 04-21 23:58:23 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-21 23:58:23 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP0 pid=444) DEBUG 04-21 23:58:23 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=444) 
DEBUG 04-21 23:58:23 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.0 ms -(Worker_TP0 pid=444) DEBUG 04-21 23:58:23 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-21 23:58:23 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=444) DEBUG 04-21 23:58:23 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-21 23:58:23 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=445) DEBUG 04-21 23:58:23 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.5 ms -(Worker_TP1 pid=445) DEBUG 04-21 23:58:23 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-21 23:58:23 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=445) DEBUG 04-21 23:58:23 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-21 23:58:23 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/68d54dc2af/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_2') -(Worker_TP0 pid=444) DEBUG 04-21 23:58:24 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-21 23:58:24 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP1 pid=445) DEBUG 04-21 23:58:24 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-21 23:58:24 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP0 
pid=444) DEBUG 04-21 23:58:24 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=444) DEBUG 04-21 23:58:24 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 35.5 ms -(Worker_TP0 pid=444) DEBUG 04-21 23:58:24 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP0 pid=444) DEBUG 04-21 23:58:24 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP0 pid=444) DEBUG 04-21 23:58:24 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-21 23:58:24 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=445) DEBUG 04-21 23:58:24 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.2 ms -(Worker_TP1 pid=445) DEBUG 04-21 23:58:24 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP1 pid=445) DEBUG 04-21 23:58:24 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP1 pid=445) DEBUG 04-21 23:58:24 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-21 23:58:24 [compilation/backends.py:377] Store the 27-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_27', '/data/.cache/vllm/torch_compile_cache/68d54dc2af/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_27') -(Worker_TP0 pid=444) INFO 04-21 23:58:24 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.48 s -(APIServer pid=1) DEBUG 04-21 23:58:24 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP0 pid=444) DEBUG 04-21 23:58:24 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/68d54dc2af/rank_0_0/backbone/computation_graph.py -(Worker_TP0 pid=444) INFO 04-21 23:58:25 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/b88d4c6cebccb530609b324c5e6df85458bab759d77baaef56518e181c625670/rank_0_0/model -(Worker_TP0 pid=444) INFO 04-21 23:58:25 [compilation/monitor.py:48] torch.compile took 12.23 s in total -(Worker_TP0 pid=444) WARNING 04-21 23:58:27 [model_executor/.../fused_moe/fused_moe.py:1090] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=64,N=704,device_name=NVIDIA_H100_80GB_HBM3.json -(Worker_TP0 pid=444) INFO 04-21 23:58:27 [compilation/monitor.py:76] Initial profiling/warmup run took 2.16 s -(Worker_TP1 pid=445) INFO 04-21 23:58:33 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP1 pid=445) DEBUG 04-21 23:58:33 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP1 pid=445) INFO 04-21 23:58:33 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP0 pid=444) INFO 04-21 23:58:33 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP0 pid=444) DEBUG 04-21 23:58:33 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP0 pid=444) INFO 04-21 23:58:33 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP0 pid=444) DEBUG 04-21 23:58:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-21 23:58:33 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-21 23:58:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-21 23:58:33 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-21 23:58:33 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-21 23:58:33 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 106.00 MiB first-capture + (51-1) × 8.00 MiB per-graph -(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None 
-(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 106.00 MiB first-capture + (51-1) × 8.00 MiB per-graph -(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 
264.00 MiB first-capture + (51-1) × 8.00 MiB per-graph -(Worker_TP1 pid=445) INFO 04-21 23:58:34 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 264.00 MiB first-capture + (51-1) × 8.00 MiB per-graph -(Worker_TP0 pid=444) INFO 04-21 23:58:34 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(APIServer pid=1) DEBUG 04-21 23:58:34 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP1 pid=445) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP1 pid=445) INFO 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.04 GiB total -(Worker_TP0 pid=444) DEBUG 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP0 pid=444) INFO 04-21 23:58:34 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.04 GiB total -(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [v1/worker/gpu_worker.py:424] Initial free memory: 77.18 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [v1/worker/gpu_worker.py:430] Free memory after profiling: 58.93 GiB (total), 56.97 GiB (within requested) -(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [v1/worker/gpu_worker.py:435] Memory profiling takes 22.13 seconds. Total non KV cache memory: 20.45GiB; torch peak memory increase: 2.85GiB; non-torch forward increase memory: 2.07GiB; weights memory: 15.53GiB. -(Worker_TP1 pid=445) INFO 04-21 23:58:35 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. 
To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9631 to maintain the same effective KV cache size. -(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=444) DEBUG 04-21 23:58:35 [v1/worker/gpu_worker.py:424] Initial free memory: 77.18 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP0 pid=444) DEBUG 04-21 23:58:35 [v1/worker/gpu_worker.py:430] Free memory after profiling: 58.93 GiB (total), 56.97 GiB (within requested) -(Worker_TP0 pid=444) DEBUG 04-21 23:58:35 [v1/worker/gpu_worker.py:435] Memory profiling takes 22.13 seconds. Total non KV cache memory: 20.45GiB; torch peak memory increase: 2.85GiB; non-torch forward increase memory: 2.07GiB; weights memory: 15.53GiB. -(Worker_TP0 pid=444) INFO 04-21 23:58:35 [v1/worker/gpu_worker.py:436] Available KV cache memory: 54.78 GiB -(Worker_TP0 pid=444) INFO 04-21 23:58:35 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9631 to maintain the same effective KV cache size. 
-(Worker_TP0 pid=444) DEBUG 04-21 23:58:35 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-21 23:58:35 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) INFO 04-21 23:58:35 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,890,896 tokens -(EngineCore pid=245) INFO 04-21 23:58:35 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 230.82x -(Worker_TP0 pid=444) DEBUG 04-21 23:58:35 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=445) 2026-04-21 23:58:35,346 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP0 pid=444) 2026-04-21 23:58:35,346 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-21 23:58:35 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) 2026-04-21 23:58:35,380 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP1 pid=445) 2026-04-21 23:58:35,380 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-21 23:58:35 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=245) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=245) DEBUG 04-21 23:58:53 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. 
-(EngineCore pid=245) INFO 04-21 23:58:53 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP0 pid=444) DEBUG 04-21 23:58:53 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=445) DEBUG 04-21 23:58:53 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-21 23:58:53 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-21 23:58:53 [v1/engine/core.py:1158] EngineCore waiting for work. -(EngineCore pid=245) DEBUG 04-21 23:58:53 [v1/engine/core.py:1158] EngineCore waiting for work. -(APIServer pid=1) INFO 04-21 23:58:53 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] -(APIServer pid=1) WARNING 04-21 23:58:53 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 0.2}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. -(APIServer pid=1) DEBUG 04-21 23:58:54 [renderers/base.py:197] Warming up chat template processing... -(APIServer pid=1) DEBUG 04-21 23:58:54 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80f3e-31013c70468c9b6a657b0454;72e067e6-2118-4306-9a44-d0fff2b7b1f5) -(APIServer pid=1) DEBUG 04-21 23:58:54 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-21 23:58:54 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. 
-(Worker_TP1 pid=445) DEBUG 04-21 23:58:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=444) DEBUG 04-21 23:58:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) INFO 04-21 23:58:55 [renderers/hf.py:314] Detected the chat template content format to be 'openai'. You can set `--chat-template-content-format` to override this. -(APIServer pid=1) DEBUG 04-21 23:58:55 [renderers/base.py:203] Chat template warmup completed in 1.683s -(APIServer pid=1) DEBUG 04-21 23:58:55 [renderers/base.py:218] Warming up multi-modal processing... -(APIServer pid=1) DEBUG 04-21 23:58:55 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e80f3f-79c7822c760ebf2346048bfc;6a44fe89-171a-4323-ae01-fded739ec56a) -(APIServer pid=1) DEBUG 04-21 23:58:55 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-21 23:58:55 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct/resolve/main/processor_config.json. 
-(APIServer pid=1) INFO 04-21 23:58:58 [renderers/base.py:231] Multi-modal warmup completed in 2.691s -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:37] Available routes are: -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: HEAD, GET -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /docs, Methods: HEAD, GET -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: HEAD, GET -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /redoc, Methods: HEAD, GET -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /load, Methods: GET -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /version, Methods: GET -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /health, Methods: GET -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /ping, Methods: GET -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /ping, Methods: POST -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST -(APIServer pid=1) INFO 04-21 23:58:58 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST -(APIServer pid=1) INFO 04-21 23:58:58 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST -(APIServer pid=1) INFO: Started server process [1] -(APIServer pid=1) INFO: Waiting for application startup. -(APIServer pid=1) INFO: Application startup complete. -(APIServer pid=1) DEBUG 04-21 23:59:03 [v1/engine/async_llm.py:875] Called check_health. -(APIServer pid=1) INFO: 10.128.4.2:57898 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp1pp1dp1--8192.FAILED.log b/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp1pp1dp1--8192.FAILED.log deleted file mode 100644 index 0d44cb0c..00000000 --- a/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp1pp1dp1--8192.FAILED.log +++ /dev/null @@ -1,1112 +0,0 @@ -DEBUG 04-22 00:58:18 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
-DEBUG 04-22 00:58:18 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:58:18 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:58:18 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:58:18 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:58:18 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:58:18 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:58:18 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:58:18 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:58:18 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:58:18 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:58:18 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:58:23 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:58:25 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 00:58:25 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:58:25 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:58:25 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:58:25 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:58:25 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:58:25 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:58:25 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:58:25 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model openai/gpt-oss-20b -(APIServer pid=1) INFO 04-22 00:58:25 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:58:25 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:58:25 [entrypoints/utils.py:233] non-default args: {'model_tag': 'openai/gpt-oss-20b', 'model': 'openai/gpt-oss-20b', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:58:25 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:58:25 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.gpt_oss.GptOssForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:58:25 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0013268 secs -(APIServer pid=1) INFO 04-22 00:58:25 [config/model.py:549] Resolved architecture: GptOssForCausalLM -(APIServer pid=1) Parse safetensors files: 0%| | 0/3 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=248) DEBUG 04-22 00:58:37 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=248) DEBUG 04-22 00:58:37 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=248) INFO 04-22 00:58:37 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='openai/gpt-oss-20b', speculative_config=None, tokenizer='openai/gpt-oss-20b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=mxfp4, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='openai_gptoss', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=openai/gpt-oss-20b, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 1024, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=248) DEBUG 04-22 00:58:38 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.34:47009 backend=nccl -(EngineCore pid=248) INFO 04-22 00:58:38 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.34:47009 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(EngineCore pid=248) DEBUG 04-22 00:58:38 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=248) INFO 04-22 00:58:38 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A -(EngineCore pid=248) DEBUG 04-22 00:58:38 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776819518.7695065, auto_measure=True -(EngineCore pid=248) DEBUG 04-22 00:58:38 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=248) DEBUG 04-22 00:58:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=248) DEBUG 04-22 00:58:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=248) DEBUG 04-22 00:58:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=248) DEBUG 04-22 00:58:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward 
method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=248) DEBUG 04-22 00:58:38 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(EngineCore pid=248) DEBUG 04-22 00:58:38 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=248) DEBUG 04-22 00:58:38 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=248) DEBUG 04-22 00:58:38 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=248) INFO 04-22 00:58:38 [v1/worker/gpu_model_runner.py:4735] Starting to load model openai/gpt-oss-20b... -(EngineCore pid=248) DEBUG 04-22 00:58:39 [model_executor/.../quantization/mxfp4.py:75] MXFP4 linear layer is not implemented - falling back to UnquantizedLinearMethod. -(EngineCore pid=248) DEBUG 04-22 00:58:39 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=64, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=True, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASHINFER: [attention sinks not supported], FLEX_ATTENTION: [attention sinks not supported]}. -(EngineCore pid=248) INFO 04-22 00:58:39 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'TRITON_ATTN']. -(EngineCore pid=248) INFO 04-22 00:58:39 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=248) DEBUG 04-22 00:58:39 [model_executor/.../quantization/mxfp4.py:84] MXFP4 attention layer is not implemented. Skipping quantization for this layer. 
-(EngineCore pid=248) DEBUG 04-22 00:58:39 [model_executor/.../oracle/mxfp4.py:355] Mxfp4 MoE backend 'FLASHINFER_TRTLLM_MXFP4_BF16' does not support the deployment configuration since kernel does not support current device cuda. -(EngineCore pid=248) DEBUG 04-22 00:58:39 [model_executor/.../oracle/mxfp4.py:355] Mxfp4 MoE backend 'CK' does not support the deployment configuration since kernel does not support current device cuda. -(EngineCore pid=248) INFO 04-22 00:58:39 [model_executor/.../oracle/mxfp4.py:352] Using 'TRITON' Mxfp4 MoE backend. -(EngineCore pid=248) DEBUG 04-22 00:58:39 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts -(EngineCore pid=248) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=248) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=248) DEBUG 04-22 00:58:39 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=248) DEBUG 04-22 00:58:39 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=248) DEBUG 04-22 00:58:39 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 49, 'fused_moe': 24, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=248) DEBUG 04-22 00:58:39 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(EngineCore pid=248) DEBUG 04-22 00:58:39 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00000-of-00002.safetensors', 'model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors']] -(EngineCore pid=248) Loading safetensors checkpoint shards: 0% Completed | 0/3 [00:00 -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/router/gate_linear.py -(EngineCore pid=248) 
DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/mxfp4.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gpt_oss.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=248) INFO 04-22 00:58:57 [compilation/backends.py:1051] Using cache directory: 
/data/.cache/vllm/torch_compile_cache/529e03afeb/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=740e479b7e comp=e546579c48 code=0467e540cccd21fe4f6a70bb62bff632298390212396e8c7305d702bcfcff1ab dir=/data/.cache/vllm/torch_compile_cache/529e03afeb/rank_0_0/backbone -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 
'auto', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, 
-(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=248) DEBUG 04-22 00:58:57 
[compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=248) DEBUG 04-22 00:58:57 
[compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=248) DEBUG 04-22 00:58:57 
[compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
-(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} -(EngineCore pid=248) DEBUG 04-22 00:58:57 [compilation/backends.py:1074] Vllm config hash: 740e479b7e -(EngineCore pid=248) INFO 04-22 00:58:57 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.32 s -(APIServer pid=1) DEBUG 04-22 00:58:57 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=248) DEBUG 04-22 00:58:58 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=248) DEBUG 04-22 00:58:58 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=248) DEBUG 04-22 00:58:58 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=248) DEBUG 04-22 00:58:58 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=248) DEBUG 04-22 00:58:58 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms -(EngineCore pid=248) DEBUG 04-22 00:58:58 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=248) DEBUG 04-22 00:58:58 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=248) INFO 04-22 00:59:00 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=248) DEBUG 04-22 00:59:00 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/529e03afeb/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=248) DEBUG 04-22 00:59:01 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=248) DEBUG 04-22 00:59:01 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=248) DEBUG 
04-22 00:59:01 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms -(EngineCore pid=248) DEBUG 04-22 00:59:01 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=248) DEBUG 04-22 00:59:01 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=248) DEBUG 04-22 00:59:02 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/529e03afeb/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=248) DEBUG 04-22 00:59:02 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=248) DEBUG 04-22 00:59:02 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(EngineCore pid=248) DEBUG 04-22 00:59:02 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms -(EngineCore pid=248) DEBUG 04-22 00:59:02 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=248) DEBUG 04-22 00:59:02 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=248) DEBUG 04-22 00:59:02 [compilation/backends.py:377] Store the 24-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_24', '/data/.cache/vllm/torch_compile_cache/529e03afeb/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_24') -(EngineCore pid=248) INFO 04-22 00:59:02 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 4.61 s -(EngineCore pid=248) DEBUG 04-22 00:59:02 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/529e03afeb/rank_0_0/backbone/computation_graph.py -(EngineCore 
pid=248) INFO 04-22 00:59:04 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/fa25af138620590d2fed8e4ac24c59bbe55ddebfc4568c415c2ad3cab596cd1a/rank_0_0/model -(EngineCore pid=248) INFO 04-22 00:59:04 [compilation/monitor.py:48] torch.compile took 9.54 s in total -(APIServer pid=1) DEBUG 04-22 00:59:07 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=248) INFO 04-22 00:59:08 [compilation/monitor.py:76] Initial profiling/warmup run took 4.77 s -(EngineCore pid=248) INFO 04-22 00:59:14 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=1024 -(EngineCore pid=248) DEBUG 04-22 00:59:14 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=248) INFO 04-22 00:59:14 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=83 (largest=1024), FULL=83 (largest=1024) -(EngineCore pid=248) DEBUG 04-22 00:59:14 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=248) DEBUG 04-22 00:59:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=248) DEBUG 04-22 00:59:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=1024, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=248) DEBUG 04-22 00:59:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=248) DEBUG 04-22 00:59:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=248) DEBUG 04-22 00:59:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=1008, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=248) DEBUG 04-22 00:59:15 
[v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 186.00 MiB first-capture + (83-1) × 6.00 MiB per-graph -(EngineCore pid=248) DEBUG 04-22 00:59:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=248) DEBUG 04-22 00:59:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=248) DEBUG 04-22 00:59:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=1024, num_reqs=1024, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=248) DEBUG 04-22 00:59:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=248) DEBUG 04-22 00:59:15 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=248) DEBUG 04-22 00:59:15 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=1008, num_reqs=1008, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=248) DEBUG 04-22 00:59:15 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 6.00 MiB first-capture + (83-1) × 8.00 MiB per-graph -(EngineCore pid=248) DEBUG 04-22 00:59:16 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=248) INFO 04-22 00:59:16 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.30 GiB total -(EngineCore pid=248) DEBUG 04-22 00:59:16 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=248) DEBUG 04-22 00:59:16 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.77 GiB (total), 59.33 GiB (within requested) -(EngineCore pid=248) DEBUG 04-22 00:59:16 [v1/worker/gpu_worker.py:435] Memory profiling takes 21.74 seconds. 
Total non KV cache memory: 16.76GiB; torch peak memory increase: 2.87GiB; non-torch forward increase memory: 0.25GiB; weights memory: 13.64GiB. -(EngineCore pid=248) INFO 04-22 00:59:16 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.47 GiB -(EngineCore pid=248) INFO 04-22 00:59:16 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9665 to maintain the same effective KV cache size. -(EngineCore pid=248) INFO 04-22 00:59:16 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,277,184 tokens -(EngineCore pid=248) INFO 04-22 00:59:16 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 155.75x -(EngineCore pid=248) 2026-04-22 00:59:16,275 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(EngineCore pid=248) DEBUG 04-22 00:59:16 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=248) 2026-04-22 00:59:16,316 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=248) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/83 [00:00 -(APIServer pid=1) sys.exit(main()) -(APIServer pid=1) ^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main -(APIServer pid=1) args.dispatch_function(args) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd -(APIServer pid=1) uvloop.run(run_server(args)) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run -(APIServer pid=1) return __asyncio.run( -(APIServer pid=1) ^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run -(APIServer pid=1) return runner.run(main) -(APIServer pid=1) ^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run -(APIServer pid=1) return self._loop.run_until_complete(task) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper -(APIServer pid=1) return await main -(APIServer pid=1) ^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server -(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker -(APIServer pid=1) async with build_async_engine_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ 
-(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client -(APIServer pid=1) async with build_async_engine_client_from_engine_args( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args -(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config -(APIServer pid=1) return cls( -(APIServer pid=1) ^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 154, in __init__ -(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(APIServer pid=1) return func(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client -(APIServer pid=1) return AsyncMPClient(*client_args) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(APIServer pid=1) return func(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File 
"/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 887, in __init__ -(APIServer pid=1) super().__init__( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 535, in __init__ -(APIServer pid=1) with launch_core_engines( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ -(APIServer pid=1) next(self.gen) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines -(APIServer pid=1) wait_for_engine_startup( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup -(APIServer pid=1) raise RuntimeError( -(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} diff --git a/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.FAILED.log b/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.FAILED.log deleted file mode 100644 index 6ee6d6d3..00000000 --- a/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.FAILED.log +++ /dev/null @@ -1,2104 +0,0 @@ -DEBUG 04-22 19:52:48 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 19:52:48 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 19:52:48 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 19:52:48 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:52:48 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:52:48 [platforms/__init__.py:113] Checking if ROCm platform is available. 
-DEBUG 04-22 19:52:48 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 19:52:48 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 19:52:48 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 19:52:48 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:52:48 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:52:48 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 19:52:52 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 19:52:54 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 19:52:54 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 19:52:54 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 19:52:54 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 19:52:54 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 19:52:54 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 19:52:54 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 19:52:54 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 19:52:54 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model openai/gpt-oss-20b -(APIServer pid=1) INFO 04-22 19:52:54 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 19:52:54 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 19:52:54 [entrypoints/utils.py:233] non-default args: {'model_tag': 'openai/gpt-oss-20b', 'model': 'openai/gpt-oss-20b', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 19:52:54 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 19:52:55 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.gpt_oss.GptOssForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 19:52:55 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0027820 secs -(APIServer pid=1) INFO 04-22 19:52:55 [config/model.py:549] Resolved architecture: GptOssForCausalLM -(APIServer pid=1) Parse safetensors files: 0%| | 0/3 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) DEBUG 04-22 19:52:57 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. 
-(APIServer pid=1) INFO 04-22 19:52:57 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 19:52:57 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 19:52:57 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 19:52:58 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 19:52:58 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 19:53:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 19:53:01 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 19:53:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 19:53:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:53:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:53:01 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 19:53:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 19:53:01 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 19:53:01 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 19:53:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:53:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:53:01 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 19:53:06 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(APIServer pid=1) DEBUG 04-22 19:53:08 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. 
-(EngineCore pid=250) DEBUG 04-22 19:53:08 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 19:53:08 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=250) DEBUG 04-22 19:53:08 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/78b56809-ee37-459c-aba1-e0f72246333f'], outputs=['ipc:///tmp/9f7c989d-ac51-4563-bcdd-5b5a4d6ac892'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=250) DEBUG 04-22 19:53:08 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=250) DEBUG 04-22 19:53:08 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=250) DEBUG 04-22 19:53:08 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=250) DEBUG 04-22 19:53:08 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=250) DEBUG 04-22 19:53:08 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=250) INFO 04-22 19:53:08 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='openai/gpt-oss-20b', speculative_config=None, tokenizer='openai/gpt-oss-20b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=mxfp4, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='openai_gptoss', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=openai/gpt-oss-20b, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 1024, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=250) WARNING 04-22 19:53:08 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=250) INFO 04-22 19:53:08 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.130.3.158 (local), world_size=2, local_world_size=2 -(EngineCore pid=250) DEBUG 04-22 19:53:08 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/f9dc1f06-7f86-4f82-ac37-a14d48e38fc0 -(EngineCore pid=250) DEBUG 04-22 19:53:08 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_da7bfba3'), local_subscribe_addr='ipc:///tmp/f9dc1f06-7f86-4f82-ac37-a14d48e38fc0', local_notify_addr='ipc:///tmp/f2478e9c-eaaa-4243-b0e4-a20bc65277fc', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 19:53:11 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 19:53:11 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 19:53:11 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 19:53:11 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:53:11 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:53:11 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 19:53:11 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 19:53:11 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 19:53:11 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 19:53:11 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:53:11 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:53:11 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-22 19:53:12 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 19:53:12 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 19:53:12 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 19:53:12 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:53:12 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:53:12 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 19:53:12 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 19:53:12 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 19:53:12 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 19:53:12 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:53:12 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:53:12 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 19:53:16 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 19:53:16 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 19:53:18 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 19:53:18 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 19:53:18 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 19:53:18 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-DEBUG 04-22 19:53:18 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 19:53:18 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 19:53:18 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 19:53:18 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) DEBUG 04-22 19:53:18 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker pid=449) DEBUG 04-22 19:53:18 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:42747 backend=nccl -(Worker pid=449) INFO 04-22 19:53:18 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:42747 backend=nccl -(Worker pid=450) DEBUG 04-22 19:53:18 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:42747 backend=nccl -(Worker pid=450) INFO 04-22 19:53:18 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:42747 backend=nccl -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=450) DEBUG 04-22 19:53:18 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=449) DEBUG 04-22 19:53:18 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 1 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 -(Worker pid=450) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=450) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=449) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=449) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=449) DEBUG 04-22 19:53:19 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=449) INFO 04-22 19:53:19 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=450) DEBUG 04-22 19:53:19 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=449) DEBUG 04-22 19:53:19 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=449) DEBUG 04-22 19:53:19 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/77daf599-eb59-429d-a2a7-22371debd17a -(Worker pid=449) DEBUG 04-22 19:53:19 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_f1f3ea6b'), local_subscribe_addr='ipc:///tmp/77daf599-eb59-429d-a2a7-22371debd17a', local_notify_addr='ipc:///tmp/a35154c1-e83c-418a-8993-92e9baeaa6ef', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=450) DEBUG 04-22 19:53:19 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/77daf599-eb59-429d-a2a7-22371debd17a -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 -(Worker pid=449) INFO 04-22 19:53:20 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A -(Worker pid=450) DEBUG 04-22 19:53:20 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.18GiB, total_memory=79.19GiB, cuda_memory=2.01GiB, torch_memory=0.02GiB, non_torch_memory=1.99GiB, timestamp=1776887600.382866, auto_measure=True -(Worker pid=450) DEBUG 04-22 19:53:20 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=449) DEBUG 04-22 19:53:20 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.18GiB, total_memory=79.19GiB, cuda_memory=2.01GiB, torch_memory=0.02GiB, non_torch_memory=1.99GiB, timestamp=1776887600.4114015, auto_measure=True -(Worker pid=449) DEBUG 04-22 19:53:20 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=450) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=450) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=450) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=449) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=449) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=449) DEBUG 04-22 19:53:20 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=450) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=450) DEBUG 04-22 19:53:20 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=449) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=449) DEBUG 04-22 19:53:20 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=449) DEBUG 04-22 19:53:20 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=449) DEBUG 04-22 19:53:20 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker pid=449) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker_TP0 pid=449) INFO 04-22 19:53:20 [v1/worker/gpu_model_runner.py:4735] Starting to load model openai/gpt-oss-20b... -(Worker_TP1 pid=450) DEBUG 04-22 19:53:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker_TP0 pid=449) DEBUG 04-22 19:53:20 [model_executor/.../quantization/mxfp4.py:75] MXFP4 linear layer is not implemented - falling back to UnquantizedLinearMethod. 
-(Worker_TP0 pid=449) DEBUG 04-22 19:53:20 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=64, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=True, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASHINFER: [attention sinks not supported], FLEX_ATTENTION: [attention sinks not supported]}. -(Worker_TP0 pid=449) INFO 04-22 19:53:20 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'TRITON_ATTN']. -(Worker_TP0 pid=449) INFO 04-22 19:53:20 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP0 pid=449) DEBUG 04-22 19:53:20 [model_executor/.../quantization/mxfp4.py:84] MXFP4 attention layer is not implemented. Skipping quantization for this layer. -(Worker_TP0 pid=449) DEBUG 04-22 19:53:20 [model_executor/.../oracle/mxfp4.py:355] Mxfp4 MoE backend 'FLASHINFER_TRTLLM_MXFP4_BF16' does not support the deployment configuration since kernel does not support current device cuda. -(Worker_TP0 pid=449) DEBUG 04-22 19:53:20 [model_executor/.../oracle/mxfp4.py:355] Mxfp4 MoE backend 'CK' does not support the deployment configuration since kernel does not support current device cuda. -(Worker_TP0 pid=449) INFO 04-22 19:53:20 [model_executor/.../oracle/mxfp4.py:352] Using 'TRITON' Mxfp4 MoE backend. 
-(Worker_TP0 pid=449) DEBUG 04-22 19:53:20 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts -(Worker_TP1 pid=450) DEBUG 04-22 19:53:20 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP1 pid=450) DEBUG 04-22 19:53:21 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP1 pid=450) DEBUG 04-22 19:53:21 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 49, 'fused_moe': 24, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=450) DEBUG 04-22 19:53:21 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP0 pid=449) DEBUG 04-22 19:53:21 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=449) DEBUG 04-22 19:53:21 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=449) DEBUG 04-22 19:53:21 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 49, 'fused_moe': 24, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=449) DEBUG 04-22 19:53:21 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP0 pid=449) DEBUG 04-22 19:53:21 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors', 'model-00000-of-00002.safetensors']] -(Worker_TP1 pid=450) DEBUG 04-22 19:53:21 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00000-of-00002.safetensors', 'model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors']] -(Worker_TP0 pid=449) Loading safetensors checkpoint shards: 0% Completed | 0/3 [00:00 -(Worker_TP1 pid=450) DEBUG 04-22 19:53:34 [compilation/decorators.py:528] Start compiling function -(EngineCore pid=250) DEBUG 04-22 19:53:34 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/modular_kernel.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/prepare_finalize/no_dp_ep.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/router/gate_linear.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/mxfp4.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gpt_oss.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=98b6f8a08d comp=e546579c48 code=3e1e8df059eb2a59571f3775ec43375257349289804d84b696e7e8ef247e8675 dir=/data/.cache/vllm/torch_compile_cache/c31ab6d213/rank_1_0/backbone -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 
[compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 
[compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP1 pid=450) DEBUG 04-22 
19:53:37 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': 
None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, 
-(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP1 pid=450) 
DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': 
True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 
[compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP1 pid=450) DEBUG 
04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, 
-(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP1 pid=450) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] Vllm config hash: 98b6f8a08d -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py -(Worker_TP0 
pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/modular_kernel.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/prepare_finalize/no_dp_ep.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/router/gate_linear.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/mxfp4.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP0 pid=449) DEBUG 04-22 
19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/gpt_oss.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP0 pid=449) INFO 04-22 19:53:37 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/c31ab6d213/rank_0_0/backbone for vLLM's torch.compile -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=98b6f8a08d comp=e546579c48 code=3e1e8df059eb2a59571f3775ec43375257349289804d84b696e7e8ef247e8675 dir=/data/.cache/vllm/torch_compile_cache/c31ab6d213/rank_0_0/backbone -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP0 pid=449) DEBUG 
04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP0 pid=449) DEBUG 04-22 
19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP0 pid=449) DEBUG 
04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 
'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 
'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 
'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP0 pid=449) DEBUG 
04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 
0, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP0 pid=449) DEBUG 
04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/backends.py:1074] Vllm config hash: 98b6f8a08d -(Worker_TP0 pid=449) INFO 04-22 19:53:37 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.80 s -(Worker_TP0 pid=449) DEBUG 04-22 19:53:37 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 8192 -(Worker_TP0 pid=449) INFO 04-22 19:53:37 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm -(Worker_TP0 pid=449) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. 
-(Worker_TP0 pid=449) return func(*args, **kwargs) -(Worker_TP0 pid=449) DEBUG 04-22 19:53:38 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=8192, hidden_dim=2880, dtype=torch.bfloat16 -(Worker_TP1 pid=450) DEBUG 04-22 19:53:38 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=8192, hidden_dim=2880, dtype=torch.bfloat16 -(Worker_TP0 pid=449) INFO 04-22 19:53:38 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm -(APIServer pid=1) DEBUG 04-22 19:53:38 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=449) DEBUG 04-22 19:53:38 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(Worker_TP0 pid=449) DEBUG 04-22 19:53:38 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(Worker_TP1 pid=450) DEBUG 04-22 19:53:39 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=450) DEBUG 04-22 19:53:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=449) DEBUG 04-22 19:53:39 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=449) DEBUG 04-22 19:53:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=450) DEBUG 04-22 19:53:39 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP1 pid=450) DEBUG 04-22 19:53:39 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 24.6 ms -(Worker_TP1 pid=450) DEBUG 04-22 19:53:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 
pid=450) DEBUG 04-22 19:53:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP1 pid=450) DEBUG 04-22 19:53:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=449) DEBUG 04-22 19:53:39 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP0 pid=449) DEBUG 04-22 19:53:39 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 23.6 ms -(Worker_TP0 pid=449) DEBUG 04-22 19:53:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=449) DEBUG 04-22 19:53:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP0 pid=449) DEBUG 04-22 19:53:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=449) INFO 04-22 19:53:41 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(Worker_TP0 pid=449) DEBUG 04-22 19:53:41 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/c31ab6d213/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(Worker_TP1 pid=450) DEBUG 04-22 19:53:41 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=450) DEBUG 04-22 19:53:41 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=449) DEBUG 04-22 19:53:41 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=449) DEBUG 04-22 19:53:41 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=450) DEBUG 04-22 19:53:41 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 
pid=450) DEBUG 04-22 19:53:41 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.7 ms -(Worker_TP1 pid=450) DEBUG 04-22 19:53:41 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=450) DEBUG 04-22 19:53:41 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=450) DEBUG 04-22 19:53:41 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=449) DEBUG 04-22 19:53:42 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=449) DEBUG 04-22 19:53:42 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.0 ms -(Worker_TP0 pid=449) DEBUG 04-22 19:53:42 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=449) DEBUG 04-22 19:53:42 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=449) DEBUG 04-22 19:53:42 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=449) DEBUG 04-22 19:53:42 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/c31ab6d213/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(Worker_TP1 pid=450) DEBUG 04-22 19:53:43 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=450) DEBUG 04-22 19:53:43 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=450) DEBUG 04-22 19:53:43 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=450) DEBUG 04-22 19:53:43 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 39.4 ms -(Worker_TP1 
pid=450) DEBUG 04-22 19:53:43 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP1 pid=450) DEBUG 04-22 19:53:43 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP1 pid=450) DEBUG 04-22 19:53:43 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=449) DEBUG 04-22 19:53:43 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=449) DEBUG 04-22 19:53:43 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP0 pid=449) DEBUG 04-22 19:53:43 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=449) DEBUG 04-22 19:53:43 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 37.5 ms -(Worker_TP0 pid=449) DEBUG 04-22 19:53:43 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP0 pid=449) DEBUG 04-22 19:53:43 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP0 pid=449) DEBUG 04-22 19:53:43 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=449) DEBUG 04-22 19:53:44 [compilation/backends.py:377] Store the 24-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_24', '/data/.cache/vllm/torch_compile_cache/c31ab6d213/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_24') -(Worker_TP0 pid=449) INFO 04-22 19:53:44 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.52 s -(Worker_TP0 pid=449) DEBUG 04-22 19:53:44 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/c31ab6d213/rank_0_0/backbone/computation_graph.py -(Worker_TP0 pid=449) INFO 04-22 19:53:44 
[compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/12de341f13df4ddc331114846a3ecb4479ec6abe2ec0ad2d0a2165f5ac81498e/rank_0_0/model -(Worker_TP0 pid=449) INFO 04-22 19:53:44 [compilation/monitor.py:48] torch.compile took 10.59 s in total -(APIServer pid=1) DEBUG 04-22 19:53:48 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=449) INFO 04-22 19:53:49 [compilation/monitor.py:76] Initial profiling/warmup run took 4.67 s -(Worker_TP1 pid=450) INFO 04-22 19:53:54 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=1024 -(Worker_TP1 pid=450) DEBUG 04-22 19:53:54 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP1 pid=450) INFO 04-22 19:53:54 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=83 (largest=1024), FULL=83 (largest=1024) -(Worker_TP0 pid=449) INFO 04-22 19:53:55 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=1024 -(Worker_TP0 pid=449) DEBUG 04-22 19:53:55 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP0 pid=449) INFO 04-22 19:53:55 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=83 (largest=1024), FULL=83 (largest=1024) -(Worker_TP1 pid=450) DEBUG 04-22 19:53:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=449) DEBUG 04-22 19:53:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=449) DEBUG 04-22 19:53:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=449) DEBUG 04-22 19:53:55 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=1024, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) 
-(Worker_TP1 pid=450) DEBUG 04-22 19:53:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=450) DEBUG 04-22 19:53:55 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=1024, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=449) DEBUG 04-22 19:53:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=450) DEBUG 04-22 19:53:55 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=1008, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=1008, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 128.00 MiB first-capture + (83-1) × 6.00 MiB per-graph -(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 128.00 MiB first-capture + (83-1) × 6.00 MiB per-graph -(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: 
None -(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=1024, num_reqs=1024, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=1024, num_reqs=1024, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=1008, num_reqs=1008, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=1008, num_reqs=1008, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 8.00 MiB first-capture + (83-1) × 6.00 MiB per-graph -(Worker_TP0 pid=449) INFO 04-22 19:53:56 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 8.00 MiB first-capture + (83-1) × 6.00 MiB per-graph -(Worker_TP1 pid=450) INFO 04-22 19:53:56 
[distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP0 pid=449) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP0 pid=449) INFO 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.09 GiB total -(Worker_TP1 pid=450) DEBUG 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP1 pid=450) INFO 04-22 19:53:56 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.09 GiB total -(Worker_TP0 pid=449) DEBUG 04-22 19:53:57 [v1/worker/gpu_worker.py:424] Initial free memory: 77.18 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP0 pid=449) DEBUG 04-22 19:53:57 [v1/worker/gpu_worker.py:430] Free memory after profiling: 66.27 GiB (total), 64.32 GiB (within requested) -(Worker_TP0 pid=449) DEBUG 04-22 19:53:57 [v1/worker/gpu_worker.py:435] Memory profiling takes 23.01 seconds. Total non KV cache memory: 11.95GiB; torch peak memory increase: 2.87GiB; non-torch forward increase memory: 2.07GiB; weights memory: 7.01GiB. -(Worker_TP0 pid=449) INFO 04-22 19:53:57 [v1/worker/gpu_worker.py:436] Available KV cache memory: 63.28 GiB -(Worker_TP0 pid=449) INFO 04-22 19:53:57 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9637 to maintain the same effective KV cache size. 
-(Worker_TP0 pid=449) DEBUG 04-22 19:53:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=250) DEBUG 04-22 19:53:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=250) DEBUG 04-22 19:53:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=450) DEBUG 04-22 19:53:57 [v1/worker/gpu_worker.py:424] Initial free memory: 77.18 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP1 pid=450) DEBUG 04-22 19:53:57 [v1/worker/gpu_worker.py:430] Free memory after profiling: 66.27 GiB (total), 64.32 GiB (within requested) -(Worker_TP1 pid=450) DEBUG 04-22 19:53:57 [v1/worker/gpu_worker.py:435] Memory profiling takes 23.06 seconds. Total non KV cache memory: 11.95GiB; torch peak memory increase: 2.87GiB; non-torch forward increase memory: 2.07GiB; weights memory: 7.01GiB. -(Worker_TP1 pid=450) INFO 04-22 19:53:57 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9637 to maintain the same effective KV cache size. 
-(Worker_TP1 pid=450) DEBUG 04-22 19:53:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=250) DEBUG 04-22 19:53:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=250) INFO 04-22 19:53:57 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 2,764,768 tokens -(EngineCore pid=250) INFO 04-22 19:53:57 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 337.17x -(Worker_TP0 pid=449) DEBUG 04-22 19:53:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=450) DEBUG 04-22 19:53:57 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=450) 2026-04-22 19:53:57,277 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP0 pid=449) 2026-04-22 19:53:57,277 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(Worker_TP0 pid=449) DEBUG 04-22 19:53:57 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=450) DEBUG 04-22 19:53:57 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=450) 2026-04-22 19:53:57,319 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP0 pid=449) 2026-04-22 19:53:57,320 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP0 pid=449) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/83 [00:00 -(APIServer pid=1) sys.exit(main()) -(APIServer pid=1) ^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main -(APIServer pid=1) args.dispatch_function(args) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd -(APIServer pid=1) uvloop.run(run_server(args)) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run -(APIServer pid=1) return __asyncio.run( -(APIServer pid=1) ^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run -(APIServer pid=1) return runner.run(main) -(APIServer pid=1) ^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run -(APIServer pid=1) return self._loop.run_until_complete(task) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper -(APIServer pid=1) return await main -(APIServer pid=1) ^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server -(APIServer pid=1) await run_server_worker(listen_address, sock, args, 
**uvicorn_kwargs) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker -(APIServer pid=1) async with build_async_engine_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client -(APIServer pid=1) async with build_async_engine_client_from_engine_args( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args -(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config -(APIServer pid=1) return cls( -(APIServer pid=1) ^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 154, in __init__ -(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(APIServer pid=1) return func(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client -(APIServer pid=1) return AsyncMPClient(*client_args) -(APIServer pid=1) 
^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(APIServer pid=1) return func(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 887, in __init__ -(APIServer pid=1) super().__init__( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 535, in __init__ -(APIServer pid=1) with launch_core_engines( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ -(APIServer pid=1) next(self.gen) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines -(APIServer pid=1) wait_for_engine_startup( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup -(APIServer pid=1) raise RuntimeError( -(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {} -/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 2 leaked shared_memory objects to clean up at shutdown - warnings.warn('resource_tracker: There appear to be %d ' diff --git a/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.log deleted file mode 100644 index c1d2580f..00000000 --- a/accuracy/results/v0.19.0/logs/openai-gpt-oss-20b--h100-80gb--tp2pp1dp1--8192.log +++ /dev/null @@ -1,1855 +0,0 @@ -DEBUG 04-22 19:59:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 19:59:01 [platforms/__init__.py:37] Checking if TPU platform is available. 
-DEBUG 04-22 19:59:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 19:59:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:59:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:59:01 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 19:59:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 19:59:01 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 19:59:01 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 19:59:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:59:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:59:01 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 19:59:06 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 19:59:08 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 19:59:08 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 19:59:08 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 19:59:08 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 19:59:08 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 19:59:08 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 19:59:08 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 19:59:08 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 19:59:08 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model openai/gpt-oss-20b -(APIServer pid=1) INFO 04-22 19:59:08 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 19:59:08 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 19:59:08 [entrypoints/utils.py:233] non-default args: {'model_tag': 'openai/gpt-oss-20b', 'model': 'openai/gpt-oss-20b', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 19:59:08 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 19:59:08 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.gpt_oss.GptOssForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 19:59:08 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0032642 secs -(APIServer pid=1) INFO 04-22 19:59:08 [config/model.py:549] Resolved architecture: GptOssForCausalLM -(APIServer pid=1) Parse safetensors files: 0%| | 0/3 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) DEBUG 04-22 19:59:10 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. 
-(APIServer pid=1) INFO 04-22 19:59:10 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 19:59:10 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 19:59:10 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 19:59:11 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 19:59:11 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 19:59:15 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 19:59:15 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 19:59:15 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 19:59:15 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:59:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:59:15 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 19:59:15 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 19:59:15 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 19:59:15 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 19:59:15 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:59:15 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:59:15 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 19:59:19 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=245) DEBUG 04-22 19:59:21 [v1/engine/core.py:1018] Waiting for init message from front-end. 
-(APIServer pid=1) DEBUG 04-22 19:59:21 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=245) DEBUG 04-22 19:59:21 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/d3d31af3-ecd1-47c6-9854-1df8e53c0ac1'], outputs=['ipc:///tmp/4eaa8040-fdd7-47d7-a5e1-3383bf7c5ce1'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=245) DEBUG 04-22 19:59:21 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=245) DEBUG 04-22 19:59:21 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=245) DEBUG 04-22 19:59:21 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=245) DEBUG 04-22 19:59:21 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=245) DEBUG 04-22 19:59:21 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=245) INFO 04-22 19:59:21 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='openai/gpt-oss-20b', speculative_config=None, tokenizer='openai/gpt-oss-20b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=mxfp4, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='openai_gptoss', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=openai/gpt-oss-20b, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640, 656, 672, 688, 704, 720, 736, 752, 768, 784, 800, 816, 832, 848, 864, 880, 896, 912, 928, 944, 960, 976, 992, 1008, 1024], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 1024, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=245) WARNING 04-22 19:59:21 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=245) INFO 04-22 19:59:21 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.128.11.14 (local), world_size=2, local_world_size=2 -(EngineCore pid=245) DEBUG 04-22 19:59:21 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/a2eb9f62-781b-4250-9ef5-a396a73de42c -(EngineCore pid=245) DEBUG 04-22 19:59:21 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_800509af'), local_subscribe_addr='ipc:///tmp/a2eb9f62-781b-4250-9ef5-a396a73de42c', local_notify_addr='ipc:///tmp/ab131fb1-009e-482c-93dd-b205df334824', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 19:59:24 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 19:59:24 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 19:59:24 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 19:59:24 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:59:24 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 19:59:24 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 19:59:24 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 19:59:24 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:59:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:59:24 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 19:59:24 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 19:59:24 [platforms/__init__.py:134] Checking if XPU platform is available. 
-DEBUG 04-22 19:59:24 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 19:59:24 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:59:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:59:24 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 19:59:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:59:24 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 19:59:24 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 19:59:24 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 19:59:24 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 19:59:24 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 19:59:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 19:59:24 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 19:59:29 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 19:59:29 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 19:59:30 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 19:59:30 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 19:59:30 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 19:59:30 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-DEBUG 04-22 19:59:30 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 19:59:30 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 19:59:30 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 19:59:30 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(Worker pid=444) DEBUG 04-22 19:59:31 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:34811 backend=nccl -(Worker pid=444) INFO 04-22 19:59:31 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:34811 backend=nccl -(APIServer pid=1) DEBUG 04-22 19:59:31 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker pid=445) DEBUG 04-22 19:59:31 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:34811 backend=nccl -(Worker pid=445) INFO 04-22 19:59:31 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:34811 backend=nccl -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=444) DEBUG 04-22 19:59:31 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=445) DEBUG 04-22 19:59:31 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 -(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=444) DEBUG 04-22 19:59:31 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=444) INFO 04-22 19:59:31 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=445) DEBUG 04-22 19:59:32 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=444) DEBUG 04-22 19:59:32 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=444) DEBUG 04-22 19:59:32 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/3a64d419-020b-415e-9781-8f6479f087fc -(Worker pid=444) DEBUG 04-22 19:59:32 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_f3c4eead'), local_subscribe_addr='ipc:///tmp/3a64d419-020b-415e-9781-8f6479f087fc', local_notify_addr='ipc:///tmp/5abf5589-5384-4424-a5e8-51f14f549c6d', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=445) DEBUG 04-22 19:59:32 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/3a64d419-020b-415e-9781-8f6479f087fc -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 1 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 -(Worker pid=444) INFO 04-22 19:59:32 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A -(Worker pid=444) DEBUG 04-22 19:59:32 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.18GiB, total_memory=79.19GiB, cuda_memory=2.01GiB, torch_memory=0.02GiB, non_torch_memory=1.99GiB, timestamp=1776887972.6513388, auto_measure=True -(Worker pid=444) DEBUG 04-22 19:59:32 [v1/worker/gpu_worker.py:285] worker requested memory: 71.27GiB -(Worker pid=445) DEBUG 04-22 19:59:32 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.18GiB, total_memory=79.19GiB, cuda_memory=2.01GiB, torch_memory=0.02GiB, non_torch_memory=1.99GiB, timestamp=1776887972.7254164, auto_measure=True -(Worker pid=445) DEBUG 04-22 19:59:32 [v1/worker/gpu_worker.py:285] worker requested memory: 71.27GiB -(Worker pid=444) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=444) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=444) DEBUG 04-22 19:59:32 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(Worker pid=444) DEBUG 04-22 19:59:32 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=445) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=445) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=445) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 19:59:32 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker pid=444) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker_TP0 pid=444) INFO 04-22 19:59:32 [v1/worker/gpu_model_runner.py:4735] Starting to load model openai/gpt-oss-20b... -(Worker pid=445) DEBUG 04-22 19:59:32 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=445) DEBUG 04-22 19:59:32 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [model_executor/.../quantization/mxfp4.py:75] MXFP4 linear layer is not implemented - falling back to UnquantizedLinearMethod. 
-(Worker_TP1 pid=445) DEBUG 04-22 19:59:33 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=64, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=True, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLASHINFER: [attention sinks not supported], FLEX_ATTENTION: [attention sinks not supported]}. -(Worker_TP0 pid=444) INFO 04-22 19:59:33 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'TRITON_ATTN']. -(Worker_TP0 pid=444) INFO 04-22 19:59:33 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [model_executor/.../quantization/mxfp4.py:84] MXFP4 attention layer is not implemented. Skipping quantization for this layer. -(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [model_executor/.../oracle/mxfp4.py:355] Mxfp4 MoE backend 'FLASHINFER_TRTLLM_MXFP4_BF16' does not support the deployment configuration since kernel does not support current device cuda. -(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [model_executor/.../oracle/mxfp4.py:355] Mxfp4 MoE backend 'CK' does not support the deployment configuration since kernel does not support current device cuda. -(Worker_TP0 pid=444) INFO 04-22 19:59:33 [model_executor/.../oracle/mxfp4.py:352] Using 'TRITON' Mxfp4 MoE backend. 
-(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts -(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 49, 'fused_moe': 24, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP1 pid=445) DEBUG 04-22 19:59:33 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP1 pid=445) DEBUG 04-22 19:59:33 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP1 pid=445) DEBUG 04-22 19:59:33 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 49, 'fused_moe': 24, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=445) DEBUG 04-22 19:59:33 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP0 pid=444) DEBUG 04-22 19:59:33 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00000-of-00002.safetensors', 'model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors']] -(Worker_TP1 pid=445) DEBUG 04-22 19:59:33 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors', 'model-00000-of-00002.safetensors']] -(Worker_TP0 pid=444) Loading safetensors checkpoint shards: 0% Completed | 0/3 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=245) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=245) DEBUG 04-22 20:00:11 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. -(EngineCore pid=245) INFO 04-22 20:00:11 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP1 pid=445) DEBUG 04-22 20:00:11 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=444) DEBUG 04-22 20:00:11 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 20:00:11 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 20:00:11 [v1/engine/core.py:1158] EngineCore waiting for work. -(EngineCore pid=245) DEBUG 04-22 20:00:11 [v1/engine/core.py:1158] EngineCore waiting for work. 
-(APIServer pid=1) INFO 04-22 20:00:11 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] -(APIServer pid=1) WARNING 04-22 20:00:12 [entrypoints/.../responses/serving.py:233] For gpt-oss, we ignore --enable-auto-tool-choice and always enable tool use. -(Worker_TP0 pid=444) DEBUG 04-22 20:00:12 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=445) DEBUG 04-22 20:00:12 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) DEBUG 04-22 20:00:12 [renderers/base.py:197] Warming up chat template processing... -(APIServer pid=1) DEBUG 04-22 20:00:12 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e928cc-3cd6682951500df46a152659;ecb8632e-5d0c-43fb-9ac4-d6b1de14410e) -(APIServer pid=1) DEBUG 04-22 20:00:12 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 20:00:12 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/openai/gpt-oss-20b/resolve/main/processor_config.json. -(APIServer pid=1) DEBUG 04-22 20:00:13 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e928cc-1073a5747e1f81b31bf20d64;e3968c71-54c5-4bd5-b2a7-9b55999cfe2a) -(APIServer pid=1) DEBUG 04-22 20:00:13 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 20:00:13 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/openai/gpt-oss-20b/resolve/main/preprocessor_config.json. -(APIServer pid=1) INFO 04-22 20:00:14 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. 
-(APIServer pid=1) DEBUG 04-22 20:00:14 [renderers/base.py:203] Chat template warmup completed in 1.181s -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:37] Available routes are: -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /load, Methods: GET -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /version, Methods: GET -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /health, Methods: GET -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /ping, Methods: GET -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /ping, Methods: POST -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST -(APIServer pid=1) INFO 04-22 20:00:14 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST -(APIServer pid=1) INFO 04-22 20:00:14 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST -(APIServer pid=1) INFO: Started server process [1] -(APIServer pid=1) INFO: Waiting for application startup. -(APIServer pid=1) INFO: Application startup complete. -(APIServer pid=1) DEBUG 04-22 20:00:17 [v1/engine/async_llm.py:875] Called check_health. -(APIServer pid=1) INFO: 10.128.10.2:38542 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/qwen-qwen1-5-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/qwen-qwen1-5-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.log deleted file mode 100644 index 829c48eb..00000000 --- a/accuracy/results/v0.19.0/logs/qwen-qwen1-5-moe-a2-7b--h100-80gb--tp1pp1dp1--8192.log +++ /dev/null @@ -1,771 +0,0 @@ -DEBUG 04-23 00:59:37 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
-DEBUG 04-23 00:59:37 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-23 00:59:37 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-23 00:59:37 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-23 00:59:37 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-23 00:59:37 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-23 00:59:37 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-23 00:59:37 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-23 00:59:37 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-23 00:59:37 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-23 00:59:37 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-23 00:59:37 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-23 00:59:42 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-23 00:59:44 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-23 00:59:44 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-23 00:59:44 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-23 00:59:44 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-23 00:59:44 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-23 00:59:44 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-23 00:59:44 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-23 00:59:44 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-23 00:59:44 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen1.5-MoE-A2.7B -(APIServer pid=1) INFO 04-23 00:59:44 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-23 00:59:44 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-23 00:59:44 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen1.5-MoE-A2.7B', 'model': 'Qwen/Qwen1.5-MoE-A2.7B', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-23 00:59:44 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-23 00:59:45 [model_executor/models/registry.py:774] Cached model info file for class vllm.model_executor.models.qwen2_moe.Qwen2MoeForCausalLM not found -(APIServer pid=1) DEBUG 04-23 00:59:45 [model_executor/models/registry.py:834] Cache model info for class vllm.model_executor.models.qwen2_moe.Qwen2MoeForCausalLM miss. Loading model instead. -(APIServer pid=1) DEBUG 04-23 00:59:54 [model_executor/models/registry.py:844] Loaded model info for class vllm.model_executor.models.qwen2_moe.Qwen2MoeForCausalLM -(APIServer pid=1) DEBUG 04-23 00:59:55 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 9.9489278 secs -(APIServer pid=1) INFO 04-23 00:59:55 [config/model.py:549] Resolved architecture: Qwen2MoeForCausalLM -(APIServer pid=1) INFO 04-23 00:59:55 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-23 00:59:55 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-23 00:59:55 [config/model.py:1801] Generative models support prefix caching. 
-(APIServer pid=1) DEBUG 04-23 00:59:55 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-23 00:59:55 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-23 00:59:55 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-23 00:59:55 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-23 00:59:55 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-23 00:59:55 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-23 00:59:55 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-23 00:59:57 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-23 00:59:57 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-23 01:00:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-23 01:00:01 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-23 01:00:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-23 01:00:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-23 01:00:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-23 01:00:01 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-23 01:00:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-23 01:00:01 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-23 01:00:01 [platforms/__init__.py:165] Checking if CPU platform is available. 
-DEBUG 04-23 01:00:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-23 01:00:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-23 01:00:01 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-23 01:00:06 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(APIServer pid=1) DEBUG 04-23 01:00:07 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. -(EngineCore pid=436) DEBUG 04-23 01:00:07 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-23 01:00:07 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=436) DEBUG 04-23 01:00:07 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/21edced6-b313-45ec-92ff-bf9809b8e7de'], outputs=['ipc:///tmp/db714d47-5168-4d47-9fd1-3221cefdb843'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=436) DEBUG 04-23 01:00:07 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=436) DEBUG 04-23 01:00:07 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=436) DEBUG 04-23 01:00:07 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=436) DEBUG 04-23 01:00:07 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=436) DEBUG 04-23 01:00:07 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=436) INFO 04-23 01:00:07 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen1.5-MoE-A2.7B', speculative_config=None, tokenizer='Qwen/Qwen1.5-MoE-A2.7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen1.5-MoE-A2.7B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=436) DEBUG 04-23 01:00:08 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.130.6.25:50059 backend=nccl -(EngineCore pid=436) INFO 04-23 01:00:08 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.130.6.25:50059 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=436) DEBUG 04-23 01:00:08 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=436) INFO 04-23 01:00:08 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A -(EngineCore pid=436) DEBUG 04-23 01:00:08 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776906008.6471531, auto_measure=True -(EngineCore pid=436) DEBUG 04-23 01:00:08 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=436) DEBUG 04-23 01:00:08 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=436) DEBUG 04-23 01:00:08 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=436) DEBUG 04-23 01:00:08 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=436) DEBUG 04-23 01:00:08 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=436) DEBUG 04-23 01:00:08 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=436) DEBUG 04-23 01:00:08 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=436) DEBUG 04-23 01:00:08 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=436) DEBUG 04-23 01:00:08 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=436) INFO 04-23 01:00:08 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen1.5-MoE-A2.7B... -(EngineCore pid=436) DEBUG 04-23 01:00:09 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=436) INFO 04-23 01:00:09 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=436) INFO 04-23 01:00:09 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=436) INFO 04-23 01:00:09 [model_executor/.../oracle/unquantized.py:186] Using TRITON backend for Unquantized MoE -(EngineCore pid=436) DEBUG 04-23 01:00:09 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts -(EngineCore pid=436) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=436) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=436) DEBUG 04-23 01:00:09 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=436) DEBUG 04-23 01:00:09 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=436) DEBUG 04-23 01:00:09 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 49, 'silu_and_mul': 24, 'fused_moe': 24, 'unquantized_fused_moe': 24, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=436) DEBUG 04-23 01:00:09 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=436) DEBUG 04-23 01:00:09 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00005-of-00008.safetensors', 'model-00008-of-00008.safetensors', 'model-00003-of-00008.safetensors', 'model-00007-of-00008.safetensors', 'model-00002-of-00008.safetensors', 'model-00001-of-00008.safetensors', 'model-00006-of-00008.safetensors', 'model-00004-of-00008.safetensors']] -(APIServer pid=1) DEBUG 04-23 01:00:17 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 01:00:27 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 01:00:37 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 01:00:47 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 01:00:57 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 01:01:07 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(APIServer pid=1) DEBUG 04-23 01:01:17 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=436) INFO 04-23 01:01:22 [model_executor/model_loader/weight_utils.py:581] Time spent downloading weights for Qwen/Qwen1.5-MoE-A2.7B: 72.140959 seconds -(EngineCore pid=436) Loading safetensors checkpoint shards: 0% Completed | 0/8 [00:00 -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2_moe.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=436) INFO 04-23 01:01:51 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/bf7cc57660/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=155e491f75 comp=e546579c48 code=18cfc43ea7c46403958dd7b7827107c640593b29e9880eadef1653adb7ec84a6 dir=/data/.cache/vllm/torch_compile_cache/bf7cc57660/rank_0_0/backbone -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=436) DEBUG 04-23 
01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 
'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 
'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=436) DEBUG 04-23 
01:01:51 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=436) DEBUG 04-23 
01:01:51 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 
[compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=436) DEBUG 04-23 01:01:51 
[compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore 
pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 
'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/backends.py:1074] Vllm config hash: 155e491f75 -(EngineCore pid=436) INFO 04-23 01:01:51 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.42 s -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=436) DEBUG 04-23 01:01:51 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=436) DEBUG 04-23 01:01:52 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=436) DEBUG 04-23 01:01:52 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=436) DEBUG 04-23 01:01:52 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=436) DEBUG 04-23 01:01:52 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=436) DEBUG 04-23 01:01:52 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=436) INFO 04-23 01:01:53 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=436) DEBUG 04-23 01:01:53 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/bf7cc57660/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=436) DEBUG 04-23 01:01:53 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=436) DEBUG 04-23 01:01:53 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=436) DEBUG 04-23 01:01:53 
[compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=436) DEBUG 04-23 01:01:53 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=436) DEBUG 04-23 01:01:53 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=436) DEBUG 04-23 01:01:54 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/bf7cc57660/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=436) DEBUG 04-23 01:01:55 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=436) DEBUG 04-23 01:01:55 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(EngineCore pid=436) DEBUG 04-23 01:01:55 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=436) DEBUG 04-23 01:01:55 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=436) DEBUG 04-23 01:01:55 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=436) DEBUG 04-23 01:01:56 [compilation/backends.py:377] Store the 24-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_24', '/data/.cache/vllm/torch_compile_cache/bf7cc57660/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_24') -(EngineCore pid=436) INFO 04-23 01:01:56 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 4.27 s -(EngineCore pid=436) DEBUG 04-23 01:01:56 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/bf7cc57660/rank_0_0/backbone/computation_graph.py -(EngineCore pid=436) INFO 04-23 
01:01:57 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/fe26b561311617008fabbb42b46538f4447d96637ecd092d3fde6e2428b1dd0a/rank_0_0/model -(EngineCore pid=436) INFO 04-23 01:01:57 [compilation/monitor.py:48] torch.compile took 8.98 s in total -(APIServer pid=1) DEBUG 04-23 01:01:57 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=436) WARNING 04-23 01:01:57 [model_executor/.../fused_moe/fused_moe.py:1090] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=60,N=1408,device_name=NVIDIA_H100_80GB_HBM3.json -(EngineCore pid=436) INFO 04-23 01:01:58 [compilation/monitor.py:76] Initial profiling/warmup run took 1.65 s -(EngineCore pid=436) INFO 04-23 01:02:04 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=436) DEBUG 04-23 01:02:04 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=436) INFO 04-23 01:02:04 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=436) DEBUG 04-23 01:02:04 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=436) DEBUG 04-23 01:02:05 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, 
ubatch_slices_padded: None -(EngineCore pid=436) DEBUG 04-23 01:02:05 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 96.00 MiB first-capture + (51-1) × 8.00 MiB per-graph -(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=436) DEBUG 04-23 01:02:05 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=436) DEBUG 04-23 01:02:05 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 136.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=436) DEBUG 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=436) INFO 04-23 01:02:05 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.82 GiB total -(EngineCore pid=436) DEBUG 04-23 01:02:06 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=436) DEBUG 04-23 01:02:06 [v1/worker/gpu_worker.py:430] Free memory after profiling: 
50.86 GiB (total), 47.41 GiB (within requested) -(EngineCore pid=436) DEBUG 04-23 01:02:06 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.98 seconds. Total non KV cache memory: 29.4GiB; torch peak memory increase: 2.47GiB; non-torch forward increase memory: 0.25GiB; weights memory: 26.67GiB. -(EngineCore pid=436) INFO 04-23 01:02:06 [v1/worker/gpu_worker.py:436] Available KV cache memory: 45.83 GiB -(EngineCore pid=436) INFO 04-23 01:02:06 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9603 to maintain the same effective KV cache size. -(EngineCore pid=436) INFO 04-23 01:02:06 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 250,304 tokens -(EngineCore pid=436) INFO 04-23 01:02:06 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 30.55x -(EngineCore pid=436) 2026-04-23 01:02:06,159 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=436) DEBUG 04-23 01:02:06 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=436) 2026-04-23 01:02:06,186 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=436) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:25:57 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:25:57 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:25:57 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:25:57 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:25:57 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:25:57 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct -(APIServer pid=1) INFO 04-22 00:25:57 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:25:57 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:25:57 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'dtype': 'bfloat16', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'kv_cache_dtype': 'fp8', 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:25:57 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:25:57 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:25:57 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0014693 secs -(APIServer pid=1) INFO 04-22 00:25:57 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM -(APIServer pid=1) INFO 04-22 00:25:57 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:25:57 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:25:57 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:25:57 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:25:57 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 00:25:57 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 00:25:57 [config/cache.py:227] Using fp8 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. Meanwhile, it may cause accuracy drop without a proper scaling factor. -(APIServer pid=1) INFO 04-22 00:25:57 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:25:57 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:25:57 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:25:57 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:25:58 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:25:58 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:26:01 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:26:01 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:26:01 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:26:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:26:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:26:01 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:26:01 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:26:01 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:26:01 [platforms/__init__.py:165] Checking if CPU platform is available. 
-DEBUG 04-22 00:26:01 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:26:01 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:26:01 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:26:06 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=243) DEBUG 04-22 00:26:08 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:26:08 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=243) DEBUG 04-22 00:26:08 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/91f9a72c-0844-4dc5-aadc-a9cca228e026'], outputs=['ipc:///tmp/45be3ca9-00ed-4f3d-90fc-6f4191051c55'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 00:26:08 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 00:26:08 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 00:26:08 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 00:26:08 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 00:26:08 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=243) INFO 04-22 00:26:08 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=fp8, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) DEBUG 04-22 00:26:08 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.27:47777 backend=nccl -(EngineCore pid=243) INFO 04-22 00:26:08 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.27:47777 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) DEBUG 04-22 00:26:08 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) INFO 04-22 00:26:08 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=243) DEBUG 04-22 00:26:09 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776817569.088757, auto_measure=True -(EngineCore pid=243) DEBUG 04-22 00:26:09 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=243) DEBUG 04-22 00:26:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:26:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:26:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:26:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=243) DEBUG 04-22 00:26:09 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=243) DEBUG 04-22 00:26:09 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=243) DEBUG 04-22 00:26:09 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=243) INFO 04-22 00:26:09 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... -(EngineCore pid=243) DEBUG 04-22 00:26:09 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=fp8, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {FLEX_ATTENTION: [kv_cache_dtype not supported]}. -(EngineCore pid=243) INFO 04-22 00:26:09 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN']. -(EngineCore pid=243) INFO 04-22 00:26:09 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=243) INFO 04-22 00:26:09 [utils/deep_gemm.py:115] DeepGEMM E8M0 enabled on current platform. -(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=243) DEBUG 04-22 00:26:10 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=243) DEBUG 04-22 00:26:10 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=243) DEBUG 04-22 00:26:10 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'quant_fp8': 28, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=243) DEBUG 04-22 00:26:10 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=243) DEBUG 04-22 00:26:10 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors']] -(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(APIServer pid=1) DEBUG 04-22 00:26:28 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/input_quant_fp8.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/utils/quant_utils.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=243) INFO 04-22 00:26:28 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/b8336a0fef/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=74c824397d comp=e546579c48 code=0a3e1ce528ff50bb8acbe695e2afa6812d0aec87dbdaec559728313523b0114f dir=/data/.cache/vllm/torch_compile_cache/b8336a0fef/rank_0_0/backbone -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] Compile env factors 
(raw): -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, 
-(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore 
pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=243) DEBUG 04-22 
00:26:28 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 
[compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=243) DEBUG 04-22 00:26:28 
[compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 
[compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=243) DEBUG 04-22 
00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, 
-(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore 
pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 
[compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/backends.py:1074] Vllm config hash: 74c824397d -(EngineCore pid=243) INFO 04-22 00:26:28 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.20 s -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=243) DEBUG 04-22 00:26:28 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=243) DEBUG 04-22 00:26:29 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices 
-(EngineCore pid=243) DEBUG 04-22 00:26:29 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 00:26:29 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=243) DEBUG 04-22 00:26:29 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:26:29 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) INFO 04-22 00:26:31 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=243) DEBUG 04-22 00:26:31 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/b8336a0fef/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=243) DEBUG 04-22 00:26:31 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:26:31 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=243) DEBUG 04-22 00:26:31 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.4 ms -(EngineCore pid=243) DEBUG 04-22 00:26:31 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:26:31 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) DEBUG 04-22 00:26:33 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/b8336a0fef/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=243) DEBUG 04-22 00:26:33 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:26:33 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(EngineCore pid=243) DEBUG 04-22 00:26:33 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=243) DEBUG 04-22 00:26:33 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:26:33 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 00:26:34 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/b8336a0fef/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') -(EngineCore pid=243) INFO 04-22 00:26:34 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.43 s -(EngineCore pid=243) DEBUG 04-22 00:26:34 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/b8336a0fef/rank_0_0/backbone/computation_graph.py -(EngineCore pid=243) INFO 04-22 00:26:35 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/867349b15c2e587a1763e2f6fedbb0931bd8753bb77f9ec7ed1cb742e42e0af6/rank_0_0/model -(EngineCore pid=243) INFO 04-22 00:26:35 [compilation/monitor.py:48] torch.compile took 11.06 s in total -(EngineCore pid=243) INFO 04-22 00:26:35 [compilation/monitor.py:76] Initial profiling/warmup run took 0.43 s -(APIServer pid=1) DEBUG 04-22 00:26:38 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) INFO 04-22 00:26:41 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=243) INFO 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:26:41 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:26:41 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 160.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:26:41 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
-(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:26:41 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:26:41 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 228.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 00:26:42 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=243) INFO 04-22 00:26:42 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.61 GiB total -(EngineCore pid=243) DEBUG 04-22 00:26:42 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=243) DEBUG 04-22 00:26:42 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.98 GiB (total), 59.53 GiB (within requested) -(EngineCore pid=243) DEBUG 04-22 00:26:42 [v1/worker/gpu_worker.py:435] Memory profiling takes 18.20 seconds. Total non KV cache memory: 16.7GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.24GiB; weights memory: 14.25GiB. -(EngineCore pid=243) INFO 04-22 00:26:42 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.53 GiB -(EngineCore pid=243) INFO 04-22 00:26:42 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9577 to maintain the same effective KV cache size. 
-(EngineCore pid=243) INFO 04-22 00:26:42 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 2,192,000 tokens -(EngineCore pid=243) INFO 04-22 00:26:42 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 267.58x -(EngineCore pid=243) 2026-04-22 00:26:42,679 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=243) DEBUG 04-22 00:26:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) 2026-04-22 00:26:42,690 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:01:19 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:01:19 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:01:19 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:01:19 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:01:19 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:01:19 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-72B-Instruct -(APIServer pid=1) INFO 04-22 01:01:19 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:01:19 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:01:19 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-72B-Instruct', 'model': 'Qwen/Qwen2.5-72B-Instruct', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:01:19 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:01:20 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:01:20 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003588 secs -(APIServer pid=1) INFO 04-22 01:01:20 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM -(APIServer pid=1) INFO 04-22 01:01:20 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 01:01:20 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:01:20 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:01:20 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:01:20 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:01:20 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 01:01:20 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 01:01:20 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:01:20 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) INFO 04-22 01:01:20 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 01:01:20 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:01:20 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:01:21 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:01:21 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:01:25 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:01:25 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:01:25 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:01:25 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:01:25 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:01:25 [platforms/__init__.py:113] Checking if ROCm platform is available. 
-DEBUG 04-22 01:01:25 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:01:25 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:01:25 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:01:25 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:01:25 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:01:25 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:01:29 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=245) DEBUG 04-22 01:01:31 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:01:31 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=245) DEBUG 04-22 01:01:31 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/de8dec1b-171c-409d-b94e-f241299493cf'], outputs=['ipc:///tmp/7fdd1015-9215-4d2b-8f42-450f7d7dce64'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=245) DEBUG 04-22 01:01:31 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=245) DEBUG 04-22 01:01:31 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=245) DEBUG 04-22 01:01:31 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=245) DEBUG 04-22 01:01:31 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=245) DEBUG 04-22 01:01:31 [plugins/__init__.py:49] All plugins in this group will be 
loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(EngineCore pid=245) INFO 04-22 01:01:31 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-72B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-72B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-72B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [4096, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=245) WARNING 04-22 01:01:31 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=245) INFO 04-22 01:01:31 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.7.85 (local), world_size=2, local_world_size=2 -(EngineCore pid=245) DEBUG 04-22 01:01:31 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/8224b69f-9000-4fea-967e-665224f2c06f -(EngineCore pid=245) DEBUG 04-22 01:01:31 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_c7b9805d'), local_subscribe_addr='ipc:///tmp/8224b69f-9000-4fea-967e-665224f2c06f', local_notify_addr='ipc:///tmp/57224114-0656-4088-aa59-2c00950b7e84', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 01:01:34 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:01:34 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:01:34 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:01:34 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:01:34 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:01:34 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:01:34 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:01:34 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:01:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:01:34 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:01:34 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:01:34 [platforms/__init__.py:134] Checking if XPU platform is available. 
-DEBUG 04-22 01:01:34 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:01:34 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:01:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:01:34 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:01:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:01:34 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:01:34 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:01:34 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:01:34 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:01:34 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:01:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:01:34 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:01:39 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:01:39 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:01:40 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:01:40 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:01:40 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:01:40 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-DEBUG 04-22 01:01:40 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:01:40 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:01:40 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:01:40 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(Worker pid=444) DEBUG 04-22 01:01:41 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:33837 backend=nccl -(Worker pid=444) INFO 04-22 01:01:41 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:33837 backend=nccl -(APIServer pid=1) DEBUG 04-22 01:01:41 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker pid=445) DEBUG 04-22 01:01:41 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:33837 backend=nccl -(Worker pid=445) INFO 04-22 01:01:41 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:33837 backend=nccl -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=444) DEBUG 04-22 01:01:41 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=445) DEBUG 04-22 01:01:41 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 -(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=444) DEBUG 04-22 01:01:41 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=444) INFO 04-22 01:01:41 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=444) DEBUG 04-22 01:01:42 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=445) DEBUG 04-22 01:01:42 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=444) DEBUG 04-22 01:01:42 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/195cd3ba-554d-48ba-a941-6f3e1935529a -(Worker pid=444) DEBUG 04-22 01:01:42 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_8bfefc5e'), local_subscribe_addr='ipc:///tmp/195cd3ba-554d-48ba-a941-6f3e1935529a', local_notify_addr='ipc:///tmp/393fc9cb-2dfe-4d5e-ba19-e7442b94a71a', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=445) DEBUG 04-22 01:01:42 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/195cd3ba-554d-48ba-a941-6f3e1935529a -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(Worker pid=444) INFO 04-22 01:01:42 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(Worker pid=444) DEBUG 04-22 01:01:42 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776819702.890159, auto_measure=True -(Worker pid=444) DEBUG 04-22 01:01:42 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=445) DEBUG 04-22 01:01:42 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776819702.933578, auto_measure=True -(Worker pid=445) DEBUG 04-22 01:01:42 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=444) DEBUG 04-22 01:01:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:01:42 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=444) DEBUG 04-22 01:01:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:01:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=445) DEBUG 04-22 01:01:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=445) DEBUG 04-22 01:01:43 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=444) DEBUG 04-22 01:01:43 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=445) DEBUG 04-22 01:01:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:01:43 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=445) DEBUG 04-22 01:01:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=445) DEBUG 04-22 01:01:43 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=444) DEBUG 04-22 01:01:43 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_TP0 pid=444) INFO 04-22 01:01:43 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-72B-Instruct... -(Worker_TP0 pid=444) DEBUG 04-22 01:01:43 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_TP0 pid=444) INFO 04-22 01:01:43 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(Worker_TP0 pid=444) INFO 04-22 01:01:43 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP0 pid=444) DEBUG 04-22 01:01:43 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=444) DEBUG 04-22 01:01:43 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=444) DEBUG 04-22 01:01:43 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=444) DEBUG 04-22 01:01:43 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP1 pid=445) DEBUG 04-22 01:01:43 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP1 pid=445) DEBUG 04-22 01:01:43 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP1 pid=445) DEBUG 04-22 01:01:43 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=445) DEBUG 04-22 01:01:43 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP1 pid=445) DEBUG 04-22 01:01:44 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00016-of-00037.safetensors', 'model-00029-of-00037.safetensors', 'model-00003-of-00037.safetensors', 'model-00023-of-00037.safetensors', 'model-00019-of-00037.safetensors', 'model-00011-of-00037.safetensors', 'model-00035-of-00037.safetensors', 'model-00037-of-00037.safetensors', 'model-00024-of-00037.safetensors', 'model-00002-of-00037.safetensors', 'model-00006-of-00037.safetensors', 'model-00027-of-00037.safetensors', 'model-00018-of-00037.safetensors', 'model-00033-of-00037.safetensors', 'model-00009-of-00037.safetensors', 'model-00001-of-00037.safetensors', 'model-00004-of-00037.safetensors', 'model-00031-of-00037.safetensors', 'model-00012-of-00037.safetensors', 'model-00021-of-00037.safetensors', 'model-00026-of-00037.safetensors', 'model-00017-of-00037.safetensors', 'model-00032-of-00037.safetensors', 'model-00022-of-00037.safetensors', 'model-00028-of-00037.safetensors', 'model-00030-of-00037.safetensors', 'model-00013-of-00037.safetensors', 'model-00010-of-00037.safetensors', 'model-00008-of-00037.safetensors', 'model-00034-of-00037.safetensors', 'model-00007-of-00037.safetensors', 'model-00025-of-00037.safetensors', 'model-00020-of-00037.safetensors', 'model-00015-of-00037.safetensors', 'model-00036-of-00037.safetensors', 'model-00005-of-00037.safetensors', 'model-00014-of-00037.safetensors']] -(Worker_TP0 pid=444) DEBUG 04-22 01:01:46 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00020-of-00037.safetensors', 'model-00010-of-00037.safetensors', 'model-00001-of-00037.safetensors', 'model-00024-of-00037.safetensors', 'model-00036-of-00037.safetensors', 'model-00034-of-00037.safetensors', 'model-00007-of-00037.safetensors', 'model-00005-of-00037.safetensors', 'model-00029-of-00037.safetensors', 'model-00006-of-00037.safetensors', 'model-00011-of-00037.safetensors', 
'model-00031-of-00037.safetensors', 'model-00008-of-00037.safetensors', 'model-00030-of-00037.safetensors', 'model-00018-of-00037.safetensors', 'model-00017-of-00037.safetensors', 'model-00022-of-00037.safetensors', 'model-00019-of-00037.safetensors', 'model-00021-of-00037.safetensors', 'model-00004-of-00037.safetensors', 'model-00012-of-00037.safetensors', 'model-00037-of-00037.safetensors', 'model-00026-of-00037.safetensors', 'model-00027-of-00037.safetensors', 'model-00033-of-00037.safetensors', 'model-00003-of-00037.safetensors', 'model-00015-of-00037.safetensors', 'model-00016-of-00037.safetensors', 'model-00014-of-00037.safetensors', 'model-00002-of-00037.safetensors', 'model-00013-of-00037.safetensors', 'model-00028-of-00037.safetensors', 'model-00035-of-00037.safetensors', 'model-00009-of-00037.safetensors', 'model-00023-of-00037.safetensors', 'model-00025-of-00037.safetensors', 'model-00032-of-00037.safetensors']] -(Worker_TP0 pid=444) Loading safetensors checkpoint shards: 0% Completed | 0/37 [00:00 -(Worker_TP0 pid=444) DEBUG 04-22 01:03:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:03:25 [compilation/decorators.py:528] Start compiling function -(EngineCore pid=245) DEBUG 04-22 01:03:26 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) DEBUG 04-22 01:03:31 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=a494131711 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/93b5502620/rank_1_0/backbone -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP1 pid=445) DEBUG 04-22 
01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 
'VLLM_API_KEY': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 
'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 
'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP1 pid=445) DEBUG 
04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP1 pid=445) 
DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 
[compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 
[compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, 
-(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] Vllm config hash: a494131711 -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP0 pid=444) INFO 04-22 01:03:34 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone for vLLM's torch.compile -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=a494131711 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, 
-(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 
[compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 
[compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', 
-(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, 
-(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, 
-(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP0 pid=444) 
DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 
[compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP0 
pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 
'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/backends.py:1074] Vllm config hash: a494131711 -(Worker_TP0 pid=444) INFO 04-22 01:03:34 [compilation/backends.py:1111] Dynamo bytecode transform time: 8.74 s -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 4096 -(Worker_TP0 pid=444) INFO 04-22 01:03:34 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm -(Worker_TP0 pid=444) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. -(Worker_TP0 pid=444) return func(*args, **kwargs) -(Worker_TP0 pid=444) DEBUG 04-22 01:03:34 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=4096, hidden_dim=8192, dtype=torch.bfloat16 -(Worker_TP1 pid=445) DEBUG 04-22 01:03:34 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=4096, hidden_dim=8192, dtype=torch.bfloat16 -(Worker_TP0 pid=444) INFO 04-22 01:03:34 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm -(Worker_TP0 pid=444) DEBUG 04-22 01:03:35 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 4096), (4097, 8192)] -(Worker_TP0 pid=444) DEBUG 04-22 01:03:35 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(Worker_TP1 pid=445) DEBUG 04-22 01:03:36 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 01:03:36 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:36 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 01:03:36 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:36 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP1 pid=445) DEBUG 04-22 01:03:36 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 33.4 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:36 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:36 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP1 pid=445) DEBUG 04-22 01:03:36 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:36 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP0 pid=444) DEBUG 04-22 01:03:36 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 34.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:36 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:36 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP0 pid=444) DEBUG 04-22 01:03:36 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=444) INFO 04-22 01:03:38 [compilation/backends.py:372] Cache the graph of compile range (1, 4096) for later use -(Worker_TP0 pid=444) DEBUG 04-22 01:03:38 [compilation/backends.py:377] Store the 0-th graph for compile 
range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_0', '/data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_0') -(Worker_TP0 pid=444) DEBUG 04-22 01:03:39 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 01:03:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:39 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP0 pid=444) DEBUG 04-22 01:03:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=444) DEBUG 04-22 01:03:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:39 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 01:03:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:39 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP1 pid=445) DEBUG 04-22 01:03:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=445) DEBUG 04-22 01:03:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=444) INFO 04-22 01:03:39 [compilation/backends.py:372] Cache the graph of compile range (4097, 8192) for later use -(Worker_TP0 pid=444) DEBUG 04-22 01:03:39 [compilation/backends.py:377] Store the 0-th graph 
for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_0') -(Worker_TP0 pid=444) DEBUG 04-22 01:03:40 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 01:03:40 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:40 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 01:03:40 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:40 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=444) DEBUG 04-22 01:03:40 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 60.5 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:40 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:40 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=444) DEBUG 04-22 01:03:40 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:40 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=445) DEBUG 04-22 01:03:40 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 60.6 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:40 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:40 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=445) DEBUG 04-22 01:03:40 [compilation/passes/vllm_inductor_pass.py:79] 
FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:41 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_1', '/data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_1') -(Worker_TP0 pid=444) DEBUG 04-22 01:03:41 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 01:03:41 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:41 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP0 pid=444) DEBUG 04-22 01:03:41 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:41 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=444) DEBUG 04-22 01:03:41 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(APIServer pid=1) DEBUG 04-22 01:03:41 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP1 pid=445) DEBUG 04-22 01:03:41 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 01:03:41 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:41 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP1 pid=445) DEBUG 04-22 01:03:41 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:41 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=445) DEBUG 04-22 01:03:41 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:43 [compilation/backends.py:377] Store the 1-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_1') -(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 62.7 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms 
-(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 62.6 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/backends.py:377] Store the 80-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_80', '/data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_80') -(Worker_TP0 pid=444) INFO 04-22 01:03:48 [compilation/backends.py:390] Compiling a graph for compile range (1, 4096) takes 9.89 s -(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 
nodes, removed 0 nodes -(Worker_TP0 pid=444) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=445) DEBUG 04-22 01:03:48 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:03:49 [compilation/backends.py:377] Store the 80-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_80', '/data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_80') -(Worker_TP0 pid=444) INFO 04-22 01:03:49 [compilation/backends.py:390] Compiling a graph for compile range (4097, 8192) takes 10.81 s -(Worker_TP0 pid=444) DEBUG 04-22 01:03:49 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/93b5502620/rank_0_0/backbone/computation_graph.py -(APIServer pid=1) DEBUG 04-22 01:03:51 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP0 pid=444) INFO 04-22 01:03:52 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/2a4892efff25a14a24b24f1dce28c4ec46ef2e01c74d72f0a4f8bb5a27b1f34a/rank_0_0/model -(Worker_TP0 pid=444) INFO 04-22 01:03:52 [compilation/monitor.py:48] torch.compile took 26.78 s in total -(Worker_TP0 pid=444) INFO 04-22 01:03:53 [compilation/monitor.py:76] Initial profiling/warmup run took 1.37 s -(Worker_TP1 pid=445) INFO 04-22 01:03:59 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP1 pid=445) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP1 pid=445) INFO 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP0 pid=444) INFO 04-22 01:03:59 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP0 pid=444) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP0 pid=444) INFO 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP1 pid=445) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:03:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:03:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:03:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-22 01:03:59 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:03:59 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 152.00 MiB first-capture + (51-1) × 18.00 MiB per-graph -(Worker_TP1 pid=445) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 152.00 MiB first-capture + (51-1) × 18.00 MiB per-graph -(Worker_TP0 pid=444) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:04:00 [compilation/cuda_graph.py:271] 
Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:04:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:04:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:04:00 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 266.00 MiB first-capture + (51-1) × 10.00 MiB per-graph -(Worker_TP1 pid=445) INFO 04-22 01:04:00 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP0 pid=444) DEBUG 04-22 01:04:00 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 266.00 MiB first-capture + (51-1) × 10.00 MiB per-graph -(Worker_TP0 pid=444) INFO 04-22 01:04:00 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses 
-(Worker_TP0 pid=444) DEBUG 04-22 01:04:01 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP0 pid=444) INFO 04-22 01:04:01 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.63 GiB total -(Worker_TP1 pid=445) DEBUG 04-22 01:04:01 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP1 pid=445) INFO 04-22 01:04:01 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.63 GiB total -(APIServer pid=1) DEBUG 04-22 01:04:01 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP1 pid=445) DEBUG 04-22 01:04:01 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP1 pid=445) DEBUG 04-22 01:04:01 [v1/worker/gpu_worker.py:430] Free memory after profiling: 6.39 GiB (total), 3.96 GiB (within requested) -(Worker_TP1 pid=445) DEBUG 04-22 01:04:01 [v1/worker/gpu_worker.py:435] Memory profiling takes 35.86 seconds. Total non KV cache memory: 72.19GiB; torch peak memory increase: 2.29GiB; non-torch forward increase memory: 2.09GiB; weights memory: 67.8GiB. -(Worker_TP1 pid=445) INFO 04-22 01:04:01 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9705 to maintain the same effective KV cache size. 
-(Worker_TP1 pid=445) DEBUG 04-22 01:04:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=444) DEBUG 04-22 01:04:01 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP0 pid=444) DEBUG 04-22 01:04:01 [v1/worker/gpu_worker.py:430] Free memory after profiling: 6.39 GiB (total), 3.96 GiB (within requested) -(Worker_TP0 pid=444) DEBUG 04-22 01:04:01 [v1/worker/gpu_worker.py:435] Memory profiling takes 35.86 seconds. Total non KV cache memory: 72.19GiB; torch peak memory increase: 2.29GiB; non-torch forward increase memory: 2.09GiB; weights memory: 67.8GiB. -(Worker_TP0 pid=444) INFO 04-22 01:04:01 [v1/worker/gpu_worker.py:436] Available KV cache memory: 3.04 GiB -(Worker_TP0 pid=444) INFO 04-22 01:04:01 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9705 to maintain the same effective KV cache size. 
-(Worker_TP0 pid=444) DEBUG 04-22 01:04:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 01:04:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) INFO 04-22 01:04:01 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 19,920 tokens -(EngineCore pid=245) INFO 04-22 01:04:01 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 2.43x -(Worker_TP0 pid=444) DEBUG 04-22 01:04:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=445) DEBUG 04-22 01:04:01 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=444) INFO 04-22 01:04:01 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 -(Worker_TP1 pid=445) INFO 04-22 01:04:01 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 -(Worker_TP0 pid=444) DEBUG 04-22 01:04:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:04:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) 2026-04-22 01:04:01,715 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP1 pid=445) 2026-04-22 01:04:01,715 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(Worker_TP0 pid=444) DEBUG 04-22 01:04:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:04:01 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=245) DEBUG 04-22 01:04:02 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=444) 2026-04-22 01:04:02,600 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP1 pid=445) 2026-04-22 01:04:02,600 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP0 pid=444) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=245) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=245) INFO 04-22 01:04:13 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP0 pid=444) DEBUG 04-22 01:04:13 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=445) DEBUG 04-22 01:04:13 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 01:04:13 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 01:04:13 [v1/engine/core.py:1158] EngineCore waiting for work. -(EngineCore pid=245) DEBUG 04-22 01:04:13 [v1/engine/core.py:1158] EngineCore waiting for work. 
-(APIServer pid=1) INFO 04-22 01:04:13 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] -(APIServer pid=1) WARNING 04-22 01:04:13 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'repetition_penalty': 1.05, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. -(APIServer pid=1) DEBUG 04-22 01:04:13 [renderers/base.py:197] Warming up chat template processing... -(APIServer pid=1) DEBUG 04-22 01:04:14 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e81e8d-656378cd6e455ee10a8c4389;0c25ab62-8556-4c4a-ae58-f8c6b6ad7f4c) -(APIServer pid=1) DEBUG 04-22 01:04:14 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 01:04:14 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/resolve/main/processor_config.json. -(APIServer pid=1) DEBUG 04-22 01:04:14 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e81e8e-69d19a9d3a125bad77599992;b4b8e27f-b6b6-4c43-8945-4a13b3734d5e) -(APIServer pid=1) DEBUG 04-22 01:04:14 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 01:04:14 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/Qwen/Qwen2.5-72B-Instruct/resolve/main/preprocessor_config.json. -(Worker_TP0 pid=444) DEBUG 04-22 01:04:14 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=445) DEBUG 04-22 01:04:14 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) INFO 04-22 01:04:14 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. 
-(APIServer pid=1) DEBUG 04-22 01:04:14 [renderers/base.py:203] Chat template warmup completed in 0.664s -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:37] Available routes are: -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: HEAD, GET -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /docs, Methods: HEAD, GET -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: HEAD, GET -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /redoc, Methods: HEAD, GET -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /load, Methods: GET -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /version, Methods: GET -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /health, Methods: GET -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /ping, Methods: GET -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /ping, Methods: POST -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST -(APIServer pid=1) INFO 04-22 01:04:14 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST -(APIServer pid=1) INFO 04-22 01:04:14 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST -(APIServer pid=1) INFO: Started server process [1] -(APIServer pid=1) INFO: Waiting for application startup. -(APIServer pid=1) INFO: Application startup complete. -(APIServer pid=1) DEBUG 04-22 01:04:18 [v1/engine/async_llm.py:875] Called check_health. -(APIServer pid=1) INFO: 10.131.6.2:33822 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.log b/accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.log deleted file mode 100644 index 472a90e9..00000000 --- a/accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-i--h100-80gb--tp1pp1dp1--8192-dtbf16.log +++ /dev/null @@ -1,746 +0,0 @@ -DEBUG 04-22 00:24:34 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
-DEBUG 04-22 00:24:34 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:24:34 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:24:34 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:24:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:24:34 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:24:34 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:24:34 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:24:34 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:24:34 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:24:34 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:24:34 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:24:39 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:24:41 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 00:24:41 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:24:41 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:24:41 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:24:41 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:24:41 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:24:41 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:24:41 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:24:41 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct -(APIServer pid=1) INFO 04-22 00:24:41 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:24:41 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:24:41 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'dtype': 'bfloat16', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:24:41 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:24:41 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:24:41 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003835 secs -(APIServer pid=1) INFO 04-22 00:24:41 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM -(APIServer pid=1) INFO 04-22 00:24:41 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:24:41 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:24:41 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:24:41 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:24:41 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:24:41 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) INFO 04-22 00:24:41 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:24:41 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:24:41 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:24:41 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:24:42 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:24:42 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:24:45 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:24:45 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:24:45 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:24:45 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:24:45 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:24:45 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:24:45 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:24:45 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:24:45 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:24:45 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:24:45 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:24:45 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:24:50 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-(EngineCore pid=244) DEBUG 04-22 00:24:51 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:24:51 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=244) DEBUG 04-22 00:24:51 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/7e7d4e07-bf7f-4693-b1d1-eb24ab15edc5'], outputs=['ipc:///tmp/6b4872f8-b412-42ab-a84a-a9ba659e3875'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=244) DEBUG 04-22 00:24:51 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=244) DEBUG 04-22 00:24:52 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=244) DEBUG 04-22 00:24:52 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=244) DEBUG 04-22 00:24:52 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=244) DEBUG 04-22 00:24:52 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=244) INFO 04-22 00:24:52 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=244) DEBUG 04-22 00:24:52 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.194:50973 backend=nccl -(EngineCore pid=244) INFO 04-22 00:24:52 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.194:50973 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) DEBUG 04-22 00:24:52 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) INFO 04-22 00:24:52 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=244) DEBUG 04-22 00:24:52 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776817492.8586414, auto_measure=True -(EngineCore pid=244) DEBUG 04-22 00:24:52 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=244) DEBUG 04-22 00:24:52 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:24:52 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:24:52 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:24:52 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=244) DEBUG 04-22 00:24:53 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=244) DEBUG 04-22 00:24:53 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=244) DEBUG 04-22 00:24:53 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=244) INFO 04-22 00:24:53 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... -(EngineCore pid=244) DEBUG 04-22 00:24:53 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=244) INFO 04-22 00:24:53 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=244) INFO 04-22 00:24:53 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=244) DEBUG 04-22 00:24:53 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=244) DEBUG 04-22 00:24:53 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=244) DEBUG 04-22 00:24:53 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=244) DEBUG 04-22 00:24:53 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=244) DEBUG 04-22 00:24:54 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00002-of-00004.safetensors']] -(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=244) DEBUG 04-22 
00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=244) INFO 
04-22 00:25:11 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/7a80f98c72/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=1a6080d116 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/7a80f98c72/rank_0_0/backbone -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, 
-(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 
[compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 
[compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, 
-(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, 
-(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore 
pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=244) 
DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
-(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/backends.py:1074] Vllm config hash: 1a6080d116 -(EngineCore pid=244) INFO 04-22 00:25:11 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.50 s -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=244) DEBUG 04-22 00:25:11 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(APIServer pid=1) DEBUG 04-22 00:25:12 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=244) DEBUG 04-22 00:25:12 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:25:12 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=244) DEBUG 04-22 00:25:12 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=244) DEBUG 04-22 00:25:12 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:25:12 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=244) INFO 04-22 00:25:13 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=244) DEBUG 04-22 00:25:13 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/7a80f98c72/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=244) DEBUG 04-22 00:25:14 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:25:14 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(EngineCore pid=244) DEBUG 
04-22 00:25:14 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.2 ms -(EngineCore pid=244) DEBUG 04-22 00:25:14 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:25:14 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=244) DEBUG 04-22 00:25:15 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/7a80f98c72/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=244) DEBUG 04-22 00:25:16 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:25:16 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(EngineCore pid=244) DEBUG 04-22 00:25:16 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=244) DEBUG 04-22 00:25:16 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:25:16 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=244) DEBUG 04-22 00:25:16 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/7a80f98c72/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') -(EngineCore pid=244) INFO 04-22 00:25:16 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.04 s -(EngineCore pid=244) DEBUG 04-22 00:25:16 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/7a80f98c72/rank_0_0/backbone/computation_graph.py -(EngineCore 
pid=244) INFO 04-22 00:25:17 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/17c675e99ca757334c44a733bfd7950765a3ca0d859504322713f76ad4aa0025/rank_0_0/model -(EngineCore pid=244) INFO 04-22 00:25:17 [compilation/monitor.py:48] torch.compile took 10.20 s in total -(EngineCore pid=244) INFO 04-22 00:25:18 [compilation/monitor.py:76] Initial profiling/warmup run took 0.43 s -(APIServer pid=1) DEBUG 04-22 00:25:22 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=244) INFO 04-22 00:25:23 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=244) INFO 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:25:23 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:25:23 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:25:23 
[v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 160.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:25:23 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:25:23 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:25:23 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 228.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-22 00:25:24 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=244) INFO 04-22 00:25:24 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.61 GiB total -(EngineCore pid=244) DEBUG 04-22 00:25:24 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=244) DEBUG 04-22 00:25:24 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.98 GiB (total), 59.53 GiB (within requested) -(EngineCore pid=244) DEBUG 04-22 00:25:24 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.20 seconds. 
Total non KV cache memory: 16.7GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.24GiB; weights memory: 14.25GiB. -(EngineCore pid=244) INFO 04-22 00:25:24 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.53 GiB -(EngineCore pid=244) INFO 04-22 00:25:24 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9577 to maintain the same effective KV cache size. -(EngineCore pid=244) INFO 04-22 00:25:24 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,096,000 tokens -(EngineCore pid=244) INFO 04-22 00:25:24 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 133.79x -(EngineCore pid=244) 2026-04-22 00:25:25,237 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=244) DEBUG 04-22 00:25:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) 2026-04-22 00:25:25,244 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:51:46 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:51:46 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:51:47 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:51:47 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:51:47 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:51:47 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct -(APIServer pid=1) INFO 04-22 01:51:47 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:51:47 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:51:47 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'max_model_len': 16384, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:51:47 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:51:47 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:51:47 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003766 secs -(APIServer pid=1) INFO 04-22 01:51:47 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM -(APIServer pid=1) INFO 04-22 01:51:47 [config/model.py:1678] Using max model len 16384 -(APIServer pid=1) DEBUG 04-22 01:51:47 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:51:47 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:51:47 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:51:47 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:51:47 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) INFO 04-22 01:51:47 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:51:47 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 01:51:47 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:51:47 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:51:47 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:51:47 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:51:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:51:51 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:51:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:51:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:51:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:51:51 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:51:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:51:51 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:51:51 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:51:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:51:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:51:51 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:51:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-(EngineCore pid=242) DEBUG 04-22 01:51:57 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:51:57 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=242) DEBUG 04-22 01:51:57 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/2c69b5ce-1cf7-406f-854c-eb1818608546'], outputs=['ipc:///tmp/779d537c-a3ed-4354-9f61-af1e644c1cb0'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=242) DEBUG 04-22 01:51:57 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=242) DEBUG 04-22 01:51:57 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=242) DEBUG 04-22 01:51:57 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=242) DEBUG 04-22 01:51:57 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=242) DEBUG 04-22 01:51:57 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=242) INFO 04-22 01:51:57 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=242) DEBUG 04-22 01:51:58 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.41:55703 backend=nccl -(EngineCore pid=242) INFO 04-22 01:51:58 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.41:55703 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=242) DEBUG 04-22 01:51:58 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=242) INFO 04-22 01:51:58 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=242) DEBUG 04-22 01:51:58 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822718.9894435, auto_measure=True -(EngineCore pid=242) DEBUG 04-22 01:51:58 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=242) DEBUG 04-22 01:51:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=242) DEBUG 04-22 01:51:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=242) DEBUG 04-22 01:51:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=242) DEBUG 04-22 01:51:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=242) DEBUG 04-22 01:51:59 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=242) DEBUG 04-22 01:51:59 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=242) DEBUG 04-22 01:51:59 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=242) INFO 04-22 01:51:59 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... -(EngineCore pid=242) DEBUG 04-22 01:51:59 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=242) INFO 04-22 01:51:59 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=242) INFO 04-22 01:51:59 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=242) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=242) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=242) DEBUG 04-22 01:51:59 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=242) DEBUG 04-22 01:51:59 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=242) DEBUG 04-22 01:51:59 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=242) DEBUG 04-22 01:51:59 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=242) DEBUG 04-22 01:52:00 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors']] -(EngineCore pid=242) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=242) DEBUG 04-22 
01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=242) INFO 
04-22 01:52:12 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/71bc44c872/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=759a699594 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/71bc44c872/rank_0_0/backbone -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, 
-(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 
[compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 
[compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, 
-(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, 
-(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore 
pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=242) 
DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
-(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/backends.py:1074] Vllm config hash: 759a699594 -(EngineCore pid=242) INFO 04-22 01:52:12 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.55 s -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=242) DEBUG 04-22 01:52:12 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=242) DEBUG 04-22 01:52:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=242) DEBUG 04-22 01:52:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=242) DEBUG 04-22 01:52:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=242) DEBUG 04-22 01:52:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=242) DEBUG 04-22 01:52:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=242) INFO 04-22 01:52:14 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=242) DEBUG 04-22 01:52:14 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/71bc44c872/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=242) DEBUG 04-22 01:52:15 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=242) DEBUG 04-22 01:52:15 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=242) DEBUG 04-22 01:52:15 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=242) DEBUG 
04-22 01:52:15 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=242) DEBUG 04-22 01:52:15 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=242) DEBUG 04-22 01:52:16 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/71bc44c872/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=242) DEBUG 04-22 01:52:17 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=242) DEBUG 04-22 01:52:17 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.4 ms -(EngineCore pid=242) DEBUG 04-22 01:52:17 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=242) DEBUG 04-22 01:52:17 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=242) DEBUG 04-22 01:52:17 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=242) DEBUG 04-22 01:52:17 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/71bc44c872/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') -(EngineCore pid=242) INFO 04-22 01:52:17 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.19 s -(EngineCore pid=242) DEBUG 04-22 01:52:17 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/71bc44c872/rank_0_0/backbone/computation_graph.py -(APIServer pid=1) DEBUG 04-22 01:52:17 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=242) INFO 04-22 01:52:18 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/3040b79a955eecaf30ddc5d0b4e18acda7953487e706565418941f6941fe40ca/rank_0_0/model -(EngineCore pid=242) INFO 04-22 01:52:18 [compilation/monitor.py:48] torch.compile took 10.40 s in total -(EngineCore pid=242) INFO 04-22 01:52:19 [compilation/monitor.py:76] Initial profiling/warmup run took 0.44 s -(EngineCore pid=242) INFO 04-22 01:52:24 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=242) DEBUG 04-22 01:52:24 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=242) INFO 04-22 01:52:24 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 01:52:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 01:52:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 160.00 MiB first-capture + (51-1) × 4.00 MiB per-graph 
-(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 01:52:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) DEBUG 04-22 01:52:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 228.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=242) DEBUG 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=242) INFO 04-22 01:52:25 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.61 GiB total -(EngineCore pid=242) DEBUG 04-22 01:52:26 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=242) DEBUG 04-22 01:52:26 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.98 GiB (total), 59.53 GiB (within requested) -(EngineCore pid=242) DEBUG 04-22 01:52:26 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.58 seconds. Total non KV cache memory: 16.7GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.24GiB; weights memory: 14.25GiB. 
-(EngineCore pid=242) INFO 04-22 01:52:26 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.53 GiB -(EngineCore pid=242) INFO 04-22 01:52:26 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9577 to maintain the same effective KV cache size. -(EngineCore pid=242) INFO 04-22 01:52:26 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,095,968 tokens -(EngineCore pid=242) INFO 04-22 01:52:26 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 16,384 tokens per request: 66.89x -(EngineCore pid=242) 2026-04-22 01:52:26,135 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=242) DEBUG 04-22 01:52:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=242) 2026-04-22 01:52:26,143 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=242) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:52:57 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:52:57 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:52:57 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:52:57 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:52:57 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:52:57 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct -(APIServer pid=1) INFO 04-22 01:52:57 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:52:57 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:52:57 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'max_model_len': 32768, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:52:57 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:52:58 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:52:58 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0004144 secs -(APIServer pid=1) INFO 04-22 01:52:58 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM -(APIServer pid=1) INFO 04-22 01:52:58 [config/model.py:1678] Using max model len 32768 -(APIServer pid=1) DEBUG 04-22 01:52:58 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:52:58 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:52:58 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:52:58 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:52:58 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) INFO 04-22 01:52:58 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:52:58 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 01:52:58 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:52:58 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:52:58 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:52:58 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:53:02 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:53:02 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:53:02 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:53:02 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:53:02 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:53:02 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:53:02 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:53:02 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:53:02 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:53:02 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:53:02 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:53:02 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:53:07 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-(EngineCore pid=244) DEBUG 04-22 01:53:08 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:53:08 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=244) DEBUG 04-22 01:53:08 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/22293a4a-e2e4-42a4-80a9-340df849e155'], outputs=['ipc:///tmp/c14af7ba-85e0-4a40-9bdd-815d5a2e4143'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=244) DEBUG 04-22 01:53:08 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=244) DEBUG 04-22 01:53:08 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=244) DEBUG 04-22 01:53:08 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=244) DEBUG 04-22 01:53:08 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=244) DEBUG 04-22 01:53:08 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=244) INFO 04-22 01:53:08 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=244) DEBUG 04-22 01:53:09 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.42:54577 backend=nccl -(EngineCore pid=244) INFO 04-22 01:53:09 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.42:54577 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) DEBUG 04-22 01:53:09 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) INFO 04-22 01:53:09 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=244) DEBUG 04-22 01:53:09 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822789.6017933, auto_measure=True -(EngineCore pid=244) DEBUG 04-22 01:53:09 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=244) DEBUG 04-22 01:53:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 01:53:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=244) DEBUG 04-22 01:53:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 01:53:09 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=244) DEBUG 04-22 01:53:09 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=244) DEBUG 04-22 01:53:09 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=244) DEBUG 04-22 01:53:09 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=244) INFO 04-22 01:53:09 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... -(EngineCore pid=244) DEBUG 04-22 01:53:10 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=244) INFO 04-22 01:53:10 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=244) INFO 04-22 01:53:10 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=244) DEBUG 04-22 01:53:10 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=244) DEBUG 04-22 01:53:10 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=244) DEBUG 04-22 01:53:10 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=244) DEBUG 04-22 01:53:10 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=244) DEBUG 04-22 01:53:10 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors']] -(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=244) DEBUG 04-22 
01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=244) INFO 
04-22 01:53:22 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/517af7d22e/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=9fd3f9070d comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/517af7d22e/rank_0_0/backbone -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, 
-(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 
[compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 
[compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, 
-(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, 
-(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore 
pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=244) 
DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
-(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} -(EngineCore pid=244) DEBUG 04-22 01:53:22 [compilation/backends.py:1074] Vllm config hash: 9fd3f9070d -(EngineCore pid=244) INFO 04-22 01:53:22 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.60 s -(EngineCore pid=244) DEBUG 04-22 01:53:23 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=244) DEBUG 04-22 01:53:23 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=244) DEBUG 04-22 01:53:23 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 01:53:23 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=244) DEBUG 04-22 01:53:23 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 3.7 ms -(EngineCore pid=244) DEBUG 04-22 01:53:23 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 01:53:23 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=244) INFO 04-22 01:53:25 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=244) DEBUG 04-22 01:53:25 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/517af7d22e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=244) DEBUG 04-22 01:53:26 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 01:53:26 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=244) DEBUG 04-22 01:53:26 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.3 ms -(EngineCore pid=244) DEBUG 
04-22 01:53:26 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 01:53:26 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=244) DEBUG 04-22 01:53:27 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/517af7d22e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=244) DEBUG 04-22 01:53:28 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 01:53:28 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.0 ms -(EngineCore pid=244) DEBUG 04-22 01:53:28 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=244) DEBUG 04-22 01:53:28 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 01:53:28 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(APIServer pid=1) DEBUG 04-22 01:53:28 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=244) DEBUG 04-22 01:53:28 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/517af7d22e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') -(EngineCore pid=244) INFO 04-22 01:53:28 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.20 s -(EngineCore pid=244) DEBUG 04-22 01:53:28 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/517af7d22e/rank_0_0/backbone/computation_graph.py -(EngineCore pid=244) INFO 04-22 01:53:29 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/af475fe7eb320dca54576c329c21f5832d9a8ff077be11492d01fe7d66d5502a/rank_0_0/model -(EngineCore pid=244) INFO 04-22 01:53:29 [compilation/monitor.py:48] torch.compile took 10.53 s in total -(EngineCore pid=244) INFO 04-22 01:53:30 [compilation/monitor.py:76] Initial profiling/warmup run took 0.62 s -(EngineCore pid=244) INFO 04-22 01:53:35 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=244) DEBUG 04-22 01:53:35 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=244) INFO 04-22 01:53:35 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 01:53:36 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore 
pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 01:53:36 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 160.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 01:53:36 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 01:53:36 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 228.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=244) INFO 04-22 01:53:36 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.61 GiB total -(EngineCore pid=244) DEBUG 04-22 
01:53:37 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=244) DEBUG 04-22 01:53:37 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.98 GiB (total), 59.53 GiB (within requested) -(EngineCore pid=244) DEBUG 04-22 01:53:37 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.87 seconds. Total non KV cache memory: 16.7GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.24GiB; weights memory: 14.25GiB. -(EngineCore pid=244) INFO 04-22 01:53:37 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.53 GiB -(EngineCore pid=244) INFO 04-22 01:53:37 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9577 to maintain the same effective KV cache size. -(EngineCore pid=244) INFO 04-22 01:53:37 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,096,000 tokens -(EngineCore pid=244) INFO 04-22 01:53:37 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 32,768 tokens per request: 33.45x -(EngineCore pid=244) 2026-04-22 01:53:37,067 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(EngineCore pid=244) DEBUG 04-22 01:53:37 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) 2026-04-22 01:53:37,075 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:49:25 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:49:25 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 01:49:26 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:49:26 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:49:26 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:49:26 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct -(APIServer pid=1) INFO 04-22 01:49:26 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:49:26 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:49:26 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'max_model_len': 2048, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:49:26 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:49:26 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:49:26 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0023413 secs -(APIServer pid=1) INFO 04-22 01:49:26 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM -(APIServer 
pid=1) INFO 04-22 01:49:26 [config/model.py:1678] Using max model len 2048 -(APIServer pid=1) DEBUG 04-22 01:49:26 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:49:26 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:49:26 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:49:26 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:49:26 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 01:49:26 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:49:26 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 01:49:26 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:49:26 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:49:27 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:49:27 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:49:30 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:49:30 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:49:30 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:49:30 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:49:30 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:49:30 [platforms/__init__.py:113] Checking if ROCm platform is available. 
-DEBUG 04-22 01:49:30 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:49:30 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:49:30 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:49:30 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:49:30 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:49:30 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:49:35 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=243) DEBUG 04-22 01:49:36 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:49:36 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=243) DEBUG 04-22 01:49:36 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/a61479ac-38ea-47e6-b2c3-71d688d66295'], outputs=['ipc:///tmp/815053a2-58ee-414b-8fcf-cb5d3eabb9a5'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 01:49:36 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 01:49:36 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 01:49:36 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 01:49:36 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 01:49:36 [plugins/__init__.py:49] All plugins in this group will be 
loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(EngineCore pid=243) INFO 04-22 01:49:36 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) DEBUG 04-22 01:49:37 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.39:33129 backend=nccl -(EngineCore pid=243) INFO 04-22 01:49:37 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.39:33129 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) DEBUG 04-22 01:49:37 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) INFO 04-22 01:49:37 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=243) DEBUG 04-22 01:49:37 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822577.8015513, auto_measure=True -(EngineCore pid=243) DEBUG 04-22 01:49:37 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=243) DEBUG 04-22 01:49:37 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:49:37 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:49:37 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:49:37 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=243) DEBUG 04-22 01:49:37 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=243) DEBUG 04-22 01:49:37 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=243) DEBUG 04-22 01:49:37 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=243) INFO 04-22 01:49:38 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... -(EngineCore pid=243) DEBUG 04-22 01:49:38 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=243) INFO 04-22 01:49:38 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=243) INFO 04-22 01:49:38 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=243) DEBUG 04-22 01:49:38 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=243) DEBUG 04-22 01:49:38 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=243) DEBUG 04-22 01:49:38 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=243) DEBUG 04-22 01:49:38 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=243) DEBUG 04-22 01:49:38 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors']] -(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=243) DEBUG 04-22 
01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=243) INFO 
04-22 01:49:56 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/c1a43ecc55/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=a0c7a93070 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/c1a43ecc55/rank_0_0/backbone -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, 
-(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 
[compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 
[compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, 
-(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, 
-(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore 
pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=243) 
DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
-(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/backends.py:1074] Vllm config hash: a0c7a93070 -(EngineCore pid=243) INFO 04-22 01:49:56 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.53 s -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=243) DEBUG 04-22 01:49:56 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(APIServer pid=1) DEBUG 04-22 01:49:56 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=243) DEBUG 04-22 01:49:57 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:49:57 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 01:49:57 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms -(EngineCore pid=243) DEBUG 04-22 01:49:57 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:49:57 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) INFO 04-22 01:49:59 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=243) DEBUG 04-22 01:49:59 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/c1a43ecc55/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=243) DEBUG 04-22 01:49:59 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:49:59 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=243) DEBUG 
04-22 01:49:59 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=243) DEBUG 04-22 01:49:59 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:49:59 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) DEBUG 04-22 01:50:00 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/c1a43ecc55/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=243) DEBUG 04-22 01:50:01 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:50:01 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(EngineCore pid=243) DEBUG 04-22 01:50:01 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(EngineCore pid=243) DEBUG 04-22 01:50:01 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:50:01 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 01:50:01 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/c1a43ecc55/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') -(EngineCore pid=243) INFO 04-22 01:50:01 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.16 s -(EngineCore pid=243) DEBUG 04-22 01:50:02 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/c1a43ecc55/rank_0_0/backbone/computation_graph.py -(EngineCore 
pid=243) INFO 04-22 01:50:03 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/2b9e7b7e03218b45968c79fc3d87056325d0376cdeef027b60de5c4be6eeb4c9/rank_0_0/model -(EngineCore pid=243) INFO 04-22 01:50:03 [compilation/monitor.py:48] torch.compile took 10.32 s in total -(EngineCore pid=243) INFO 04-22 01:50:03 [compilation/monitor.py:76] Initial profiling/warmup run took 0.43 s -(APIServer pid=1) DEBUG 04-22 01:50:06 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=243) INFO 04-22 01:50:09 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=243) INFO 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:50:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:50:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:50:09 
[v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 160.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:50:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:50:09 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 228.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=243) INFO 04-22 01:50:09 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.61 GiB total -(EngineCore pid=243) DEBUG 04-22 01:50:10 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=243) DEBUG 04-22 01:50:10 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.98 GiB (total), 59.53 GiB (within requested) -(EngineCore pid=243) DEBUG 04-22 01:50:10 [v1/worker/gpu_worker.py:435] Memory profiling takes 17.42 seconds. 
Total non KV cache memory: 16.7GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.24GiB; weights memory: 14.25GiB. -(EngineCore pid=243) INFO 04-22 01:50:10 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.53 GiB -(EngineCore pid=243) INFO 04-22 01:50:10 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9577 to maintain the same effective KV cache size. -(EngineCore pid=243) INFO 04-22 01:50:10 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,096,000 tokens -(EngineCore pid=243) INFO 04-22 01:50:10 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 2,048 tokens per request: 535.16x -(EngineCore pid=243) 2026-04-22 01:50:10,204 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=243) DEBUG 04-22 01:50:10 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) 2026-04-22 01:50:10,212 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:50:36 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:50:36 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:50:36 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:50:36 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:50:36 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:50:36 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct -(APIServer pid=1) INFO 04-22 01:50:36 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:50:36 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:50:36 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'max_model_len': 4096, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:50:36 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:50:37 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:50:37 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0011555 secs -(APIServer pid=1) INFO 04-22 01:50:37 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM -(APIServer pid=1) INFO 04-22 01:50:37 [config/model.py:1678] Using max model len 4096 -(APIServer pid=1) DEBUG 04-22 01:50:37 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:50:37 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:50:37 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:50:37 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:50:37 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) INFO 04-22 01:50:37 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:50:37 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 01:50:37 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:50:37 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:50:37 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:50:37 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:50:41 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:50:41 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:50:41 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:50:41 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:50:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:50:41 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:50:41 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:50:41 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:50:41 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:50:41 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:50:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:50:41 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:50:46 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-(EngineCore pid=244) DEBUG 04-22 01:50:47 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:50:47 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=244) DEBUG 04-22 01:50:47 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/f918c33e-6b86-4f3c-9f2f-18041540ee46'], outputs=['ipc:///tmp/f05cb364-a727-4fa2-9e67-4ac7d5cd3acf'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=244) DEBUG 04-22 01:50:47 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=244) DEBUG 04-22 01:50:47 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=244) DEBUG 04-22 01:50:47 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=244) DEBUG 04-22 01:50:47 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=244) DEBUG 04-22 01:50:47 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=244) INFO 04-22 01:50:47 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=244) DEBUG 04-22 01:50:48 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.40:53241 backend=nccl -(EngineCore pid=244) INFO 04-22 01:50:48 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.40:53241 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) DEBUG 04-22 01:50:48 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) INFO 04-22 01:50:48 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=244) DEBUG 04-22 01:50:48 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776822648.5352857, auto_measure=True -(EngineCore pid=244) DEBUG 04-22 01:50:48 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=244) DEBUG 04-22 01:50:48 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 01:50:48 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=244) DEBUG 04-22 01:50:48 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 01:50:48 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=244) DEBUG 04-22 01:50:48 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=244) DEBUG 04-22 01:50:48 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=244) DEBUG 04-22 01:50:48 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=244) INFO 04-22 01:50:48 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... -(EngineCore pid=244) DEBUG 04-22 01:50:49 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=244) INFO 04-22 01:50:49 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=244) INFO 04-22 01:50:49 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=244) DEBUG 04-22 01:50:49 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=244) DEBUG 04-22 01:50:49 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=244) DEBUG 04-22 01:50:49 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=244) DEBUG 04-22 01:50:49 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=244) DEBUG 04-22 01:50:49 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors']] -(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(APIServer pid=1) DEBUG 04-22 01:50:57 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=244) INFO 04-22 01:50:59 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/e690b99c41/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=9f1e1a0fca comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/e690b99c41/rank_0_0/backbone -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-22 01:50:59 
[compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
-(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=244) DEBUG 04-22 
01:50:59 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=244) 
DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=244) DEBUG 04-22 01:50:59 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=244) DEBUG 04-22 01:50:59 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=244) DEBUG 04-22 
01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore 
pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore 
pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=244) DEBUG 04-22 01:50:59 [compilation/backends.py:1074] Vllm config hash: 9f1e1a0fca -(EngineCore pid=244) INFO 04-22 01:50:59 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.53 s -(EngineCore pid=244) DEBUG 04-22 01:51:00 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=244) DEBUG 04-22 01:51:00 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=244) DEBUG 04-22 01:51:00 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 01:51:00 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=244) DEBUG 04-22 01:51:00 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=244) DEBUG 04-22 01:51:00 [compilation/.../utility/fix_functionalization.py:203] 
De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 01:51:00 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=244) INFO 04-22 01:51:02 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=244) DEBUG 04-22 01:51:02 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/e690b99c41/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=244) DEBUG 04-22 01:51:02 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 01:51:02 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=244) DEBUG 04-22 01:51:02 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=244) DEBUG 04-22 01:51:02 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 01:51:02 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=244) DEBUG 04-22 01:51:04 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/e690b99c41/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=244) DEBUG 04-22 01:51:04 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 01:51:04 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(EngineCore pid=244) DEBUG 04-22 01:51:04 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=244) 
DEBUG 04-22 01:51:04 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 01:51:04 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=244) DEBUG 04-22 01:51:05 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/e690b99c41/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') -(EngineCore pid=244) INFO 04-22 01:51:05 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 5.10 s -(EngineCore pid=244) DEBUG 04-22 01:51:05 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/e690b99c41/rank_0_0/backbone/computation_graph.py -(EngineCore pid=244) INFO 04-22 01:51:06 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/48014cda5f34d6b24aefcc31b0f02d5d7bedc9855e095468b3e5a530b56177cf/rank_0_0/model -(EngineCore pid=244) INFO 04-22 01:51:06 [compilation/monitor.py:48] torch.compile took 10.13 s in total -(EngineCore pid=244) INFO 04-22 01:51:06 [compilation/monitor.py:76] Initial profiling/warmup run took 0.43 s -(APIServer pid=1) DEBUG 04-22 01:51:07 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=244) INFO 04-22 01:51:12 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=244) INFO 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 01:51:12 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 01:51:12 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 160.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 01:51:12 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
-(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 01:51:12 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 01:51:12 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 228.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-22 01:51:13 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=244) INFO 04-22 01:51:13 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.61 GiB total -(EngineCore pid=244) DEBUG 04-22 01:51:13 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=244) DEBUG 04-22 01:51:13 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.98 GiB (total), 59.53 GiB (within requested) -(EngineCore pid=244) DEBUG 04-22 01:51:13 [v1/worker/gpu_worker.py:435] Memory profiling takes 16.99 seconds. Total non KV cache memory: 16.7GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.24GiB; weights memory: 14.25GiB. -(EngineCore pid=244) INFO 04-22 01:51:13 [v1/worker/gpu_worker.py:436] Available KV cache memory: 58.53 GiB -(EngineCore pid=244) INFO 04-22 01:51:13 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9577 to maintain the same effective KV cache size. 
-(EngineCore pid=244) INFO 04-22 01:51:13 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,096,000 tokens -(EngineCore pid=244) INFO 04-22 01:51:13 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 4,096 tokens per request: 267.58x -(EngineCore pid=244) 2026-04-22 01:51:13,316 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=244) DEBUG 04-22 01:51:13 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) 2026-04-22 01:51:13,324 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:00:18 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:00:18 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:00:19 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:00:19 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:00:19 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:00:19 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct -(APIServer pid=1) INFO 04-22 01:00:19 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:00:19 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:00:19 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:00:19 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:00:19 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:00:19 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0004101 secs -(APIServer pid=1) INFO 04-22 01:00:19 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM -(APIServer pid=1) INFO 04-22 01:00:19 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 01:00:19 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:00:19 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:00:19 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:00:19 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:00:19 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) INFO 04-22 01:00:19 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:00:19 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 01:00:19 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:00:19 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:00:19 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:00:19 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:00:23 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:00:23 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:00:23 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:00:23 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:00:23 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:00:23 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:00:23 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:00:23 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:00:23 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:00:23 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:00:23 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:00:23 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:00:28 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-(EngineCore pid=242) DEBUG 04-22 01:00:29 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:00:29 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=242) DEBUG 04-22 01:00:29 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/70fbe184-2f30-4d0d-80e0-3c9f4a4363a7'], outputs=['ipc:///tmp/3deaa7ca-dacc-495c-bff0-288f35c7b679'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=242) DEBUG 04-22 01:00:29 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=242) DEBUG 04-22 01:00:29 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=242) DEBUG 04-22 01:00:29 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=242) DEBUG 04-22 01:00:29 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=242) DEBUG 04-22 01:00:29 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=242) INFO 04-22 01:00:29 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=242) DEBUG 04-22 01:00:30 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.84:48247 backend=nccl -(EngineCore pid=242) INFO 04-22 01:00:30 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.84:48247 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=242) DEBUG 04-22 01:00:30 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=242) INFO 04-22 01:00:30 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=242) DEBUG 04-22 01:00:30 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776819630.9128685, auto_measure=True -(EngineCore pid=242) DEBUG 04-22 01:00:30 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=242) DEBUG 04-22 01:00:31 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=242) DEBUG 04-22 01:00:31 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=242) DEBUG 04-22 01:00:31 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=242) DEBUG 04-22 01:00:31 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=242) DEBUG 04-22 01:00:31 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=242) DEBUG 04-22 01:00:31 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=242) DEBUG 04-22 01:00:31 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=242) INFO 04-22 01:00:31 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... -(EngineCore pid=242) DEBUG 04-22 01:00:31 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=242) INFO 04-22 01:00:31 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=242) INFO 04-22 01:00:31 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=242) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=242) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=242) DEBUG 04-22 01:00:31 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=242) DEBUG 04-22 01:00:31 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=242) DEBUG 04-22 01:00:31 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=242) DEBUG 04-22 01:00:31 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=242) DEBUG 04-22 01:00:32 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors']] -(EngineCore pid=242) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:38:34 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:38:34 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:38:34 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:38:34 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:38:34 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:38:34 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct -(APIServer pid=1) INFO 04-22 01:38:34 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:38:34 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:38:34 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:38:34 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:38:35 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:38:35 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0025655 secs -(APIServer pid=1) INFO 04-22 01:38:35 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM -(APIServer pid=1) INFO 04-22 01:38:35 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 01:38:35 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:38:35 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:38:35 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:38:35 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:38:35 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 01:38:35 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 01:38:35 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:38:35 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) DEBUG 04-22 01:38:37 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. -(APIServer pid=1) INFO 04-22 01:38:37 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 01:38:37 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:38:37 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:38:38 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:38:38 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:38:41 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:38:41 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:38:41 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:38:41 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:38:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
-DEBUG 04-22 01:38:41 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:38:41 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:38:41 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:38:41 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:38:41 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:38:41 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:38:41 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:38:46 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=242) DEBUG 04-22 01:38:48 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:38:48 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=242) DEBUG 04-22 01:38:48 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/a9aaebbc-67b3-48c3-bb08-891f4fe6d87b'], outputs=['ipc:///tmp/7d27bee5-a979-4651-a995-18123120cd09'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=242) DEBUG 04-22 01:38:48 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=242) DEBUG 04-22 01:38:48 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=242) DEBUG 04-22 01:38:48 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=242) DEBUG 04-22 01:38:48 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=242) DEBUG 04-22 01:38:48 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=242) INFO 04-22 01:38:48 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=242) WARNING 04-22 01:38:48 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=242) INFO 04-22 01:38:48 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.129.9.33 (local), world_size=2, local_world_size=2 -(EngineCore pid=242) DEBUG 04-22 01:38:48 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/5d89bb73-e1b7-4ead-af9d-0d069eb0cbe8 -(EngineCore pid=242) DEBUG 04-22 01:38:48 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_1b7e4d47'), local_subscribe_addr='ipc:///tmp/5d89bb73-e1b7-4ead-af9d-0d069eb0cbe8', local_notify_addr='ipc:///tmp/17f21fec-15b6-4793-af53-6da7e4ec48d3', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 01:38:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:38:51 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:38:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:38:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:38:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:38:51 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:38:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:38:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:38:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:38:51 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:38:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:38:51 [platforms/__init__.py:134] Checking if XPU platform is available. 
-DEBUG 04-22 01:38:51 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:38:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:38:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:38:51 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:38:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:38:51 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:38:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:38:51 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:38:51 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:38:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:38:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:38:51 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:38:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:38:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:38:57 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:38:57 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:38:57 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:38:57 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-DEBUG 04-22 01:38:58 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:38:58 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:38:58 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:38:58 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) DEBUG 04-22 01:38:58 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker pid=441) DEBUG 04-22 01:38:58 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:40119 backend=nccl -(Worker pid=441) INFO 04-22 01:38:58 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:40119 backend=nccl -(Worker pid=442) DEBUG 04-22 01:38:58 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:40119 backend=nccl -(Worker pid=442) INFO 04-22 01:38:58 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:40119 backend=nccl -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=442) DEBUG 04-22 01:38:58 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=441) DEBUG 04-22 01:38:58 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 1 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 -(Worker pid=441) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=442) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=441) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=442) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=441) DEBUG 04-22 01:38:58 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=441) INFO 04-22 01:38:58 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=442) DEBUG 04-22 01:38:59 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=441) DEBUG 04-22 01:38:59 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=441) DEBUG 04-22 01:38:59 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/9635c6e6-7573-4cb8-ba15-e9e4de69f58e -(Worker pid=441) DEBUG 04-22 01:38:59 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_233a6472'), local_subscribe_addr='ipc:///tmp/9635c6e6-7573-4cb8-ba15-e9e4de69f58e', local_notify_addr='ipc:///tmp/d4903846-6b2f-4ebf-8b4a-4838bed61d2e', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=442) DEBUG 04-22 01:38:59 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/9635c6e6-7573-4cb8-ba15-e9e4de69f58e -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(Worker pid=441) INFO 04-22 01:38:59 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(Worker pid=441) DEBUG 04-22 01:38:59 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776821939.8407264, auto_measure=True -(Worker pid=441) DEBUG 04-22 01:38:59 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=442) DEBUG 04-22 01:38:59 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776821939.8653443, auto_measure=True -(Worker pid=442) DEBUG 04-22 01:38:59 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=441) DEBUG 04-22 01:38:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=441) DEBUG 04-22 01:38:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=441) DEBUG 04-22 01:38:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=442) DEBUG 04-22 01:38:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=442) DEBUG 04-22 01:38:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=442) DEBUG 04-22 01:38:59 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=441) DEBUG 04-22 01:38:59 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=441) DEBUG 04-22 01:38:59 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=441) DEBUG 04-22 01:39:00 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=442) DEBUG 04-22 01:39:00 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=442) DEBUG 04-22 01:39:00 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=441) DEBUG 04-22 01:39:00 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_TP0 pid=441) INFO 04-22 01:39:00 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... -(Worker_TP0 pid=441) DEBUG 04-22 01:39:00 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_TP0 pid=441) INFO 04-22 01:39:00 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(Worker_TP0 pid=441) INFO 04-22 01:39:00 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP0 pid=441) DEBUG 04-22 01:39:00 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=441) DEBUG 04-22 01:39:00 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=441) DEBUG 04-22 01:39:00 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=441) DEBUG 04-22 01:39:00 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP1 pid=442) DEBUG 04-22 01:39:00 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP1 pid=442) DEBUG 04-22 01:39:00 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP1 pid=442) DEBUG 04-22 01:39:00 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=442) DEBUG 04-22 01:39:00 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP1 pid=442) DEBUG 04-22 01:39:00 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00002-of-00004.safetensors']] -(Worker_TP0 pid=441) DEBUG 04-22 01:39:00 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00001-of-00004.safetensors']] -(Worker_TP0 pid=441) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(Worker_TP1 pid=442) DEBUG 04-22 01:39:12 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=442) DEBUG 04-22 01:39:12 [compilation/decorators.py:528] Start compiling function -(EngineCore pid=242) DEBUG 04-22 01:39:13 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py 
-(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=eb5300d232 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/2a701ed6c9/rank_1_0/backbone -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP1 pid=442) DEBUG 
04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP1 pid=442) DEBUG 04-22 
01:39:16 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP1 
pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP1 pid=442) 
DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': 
False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] Vllm 
config hash: eb5300d232 -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP0 pid=441) DEBUG 
04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP0 pid=441) INFO 04-22 01:39:16 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/2a701ed6c9/rank_0_0/backbone for vLLM's torch.compile -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=eb5300d232 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/2a701ed6c9/rank_0_0/backbone -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP0 pid=441) DEBUG 
04-22 01:39:16 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 
1024, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': 
-1.0, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, 
-(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 
[compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, 
-(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 
'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP0 pid=441) DEBUG 
04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/backends.py:1074] Vllm config hash: eb5300d232 -(Worker_TP0 pid=441) INFO 04-22 01:39:16 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.20 s -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 8192 -(Worker_TP0 pid=441) INFO 04-22 01:39:16 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm -(Worker_TP0 pid=441) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. 
-(Worker_TP0 pid=441) return func(*args, **kwargs) -(Worker_TP0 pid=441) DEBUG 04-22 01:39:16 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=8192, hidden_dim=3584, dtype=torch.bfloat16 -(Worker_TP1 pid=442) DEBUG 04-22 01:39:16 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=8192, hidden_dim=3584, dtype=torch.bfloat16 -(Worker_TP0 pid=441) INFO 04-22 01:39:16 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm -(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(Worker_TP1 pid=442) DEBUG 04-22 01:39:17 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=442) DEBUG 04-22 01:39:17 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=442) DEBUG 04-22 01:39:17 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP1 pid=442) DEBUG 04-22 01:39:17 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.0 ms -(Worker_TP1 pid=442) DEBUG 04-22 01:39:17 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=442) DEBUG 04-22 01:39:17 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes 
-(Worker_TP1 pid=442) DEBUG 04-22 01:39:17 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.5 ms -(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP0 pid=441) DEBUG 04-22 01:39:17 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(APIServer pid=1) DEBUG 04-22 01:39:18 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=441) INFO 04-22 01:39:20 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(Worker_TP0 pid=441) DEBUG 04-22 01:39:20 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/2a701ed6c9/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(Worker_TP1 pid=442) DEBUG 04-22 01:39:20 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=442) DEBUG 04-22 01:39:20 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=442) DEBUG 04-22 01:39:20 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=442) DEBUG 04-22 01:39:20 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 66.5 ms -(Worker_TP1 pid=442) DEBUG 04-22 01:39:20 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=442) 
DEBUG 04-22 01:39:20 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=442) DEBUG 04-22 01:39:20 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=441) DEBUG 04-22 01:39:21 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=441) DEBUG 04-22 01:39:21 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=441) DEBUG 04-22 01:39:21 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=441) DEBUG 04-22 01:39:21 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 63.9 ms -(Worker_TP0 pid=441) DEBUG 04-22 01:39:21 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=441) DEBUG 04-22 01:39:21 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=441) DEBUG 04-22 01:39:21 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=441) DEBUG 04-22 01:39:21 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/2a701ed6c9/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(Worker_TP1 pid=442) DEBUG 04-22 01:39:22 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=442) DEBUG 04-22 01:39:22 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=442) DEBUG 04-22 01:39:22 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=442) DEBUG 04-22 01:39:22 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 63.3 ms -(Worker_TP1 pid=442) 
DEBUG 04-22 01:39:22 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP1 pid=442) DEBUG 04-22 01:39:22 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP1 pid=442) DEBUG 04-22 01:39:22 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=441) DEBUG 04-22 01:39:22 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=441) DEBUG 04-22 01:39:22 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=441) DEBUG 04-22 01:39:23 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=441) DEBUG 04-22 01:39:23 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 479.4 ms -(Worker_TP0 pid=441) DEBUG 04-22 01:39:23 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP0 pid=441) DEBUG 04-22 01:39:23 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP0 pid=441) DEBUG 04-22 01:39:23 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=441) DEBUG 04-22 01:39:23 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/2a701ed6c9/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') -(Worker_TP0 pid=441) INFO 04-22 01:39:23 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.23 s -(Worker_TP0 pid=441) DEBUG 04-22 01:39:23 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/2a701ed6c9/rank_0_0/backbone/computation_graph.py -(Worker_TP0 pid=441) INFO 04-22 01:39:24 [compilation/decorators.py:640] saved 
AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/aef92f39fb2a63603cb9c85df96429df4227a73756dec1d4158c61e06d29d35f/rank_0_0/model -(Worker_TP0 pid=441) INFO 04-22 01:39:24 [compilation/monitor.py:48] torch.compile took 11.64 s in total -(Worker_TP0 pid=441) INFO 04-22 01:39:24 [compilation/monitor.py:76] Initial profiling/warmup run took 0.18 s -(APIServer pid=1) DEBUG 04-22 01:39:28 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP1 pid=442) INFO 04-22 01:39:30 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP1 pid=442) INFO 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP0 pid=441) INFO 04-22 01:39:30 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP0 pid=441) INFO 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 122.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 122.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=442) DEBUG 04-22 
01:39:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=441) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 116.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP0 pid=441) INFO 04-22 01:39:30 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP1 pid=442) DEBUG 04-22 01:39:30 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 116.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP1 pid=442) INFO 04-22 01:39:30 [distributed/device_communicators/custom_all_reduce.py:216] 
Registering 0 cuda graph addresses -(Worker_TP0 pid=441) DEBUG 04-22 01:39:31 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP0 pid=441) INFO 04-22 01:39:31 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.61 GiB total -(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP1 pid=442) INFO 04-22 01:39:31 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.61 GiB total -(Worker_TP0 pid=441) DEBUG 04-22 01:39:31 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP0 pid=441) DEBUG 04-22 01:39:31 [v1/worker/gpu_worker.py:430] Free memory after profiling: 68.1 GiB (total), 65.68 GiB (within requested) -(Worker_TP0 pid=441) DEBUG 04-22 01:39:31 [v1/worker/gpu_worker.py:435] Memory profiling takes 18.94 seconds. Total non KV cache memory: 11.39GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 2.06GiB; weights memory: 7.12GiB. -(Worker_TP0 pid=441) INFO 04-22 01:39:31 [v1/worker/gpu_worker.py:436] Available KV cache memory: 63.84 GiB -(Worker_TP0 pid=441) INFO 04-22 01:39:31 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9577 to maintain the same effective KV cache size. 
-(Worker_TP0 pid=441) DEBUG 04-22 01:39:31 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=242) DEBUG 04-22 01:39:31 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=242) DEBUG 04-22 01:39:31 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [v1/worker/gpu_worker.py:430] Free memory after profiling: 68.1 GiB (total), 65.68 GiB (within requested) -(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [v1/worker/gpu_worker.py:435] Memory profiling takes 18.93 seconds. Total non KV cache memory: 11.39GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 2.06GiB; weights memory: 7.12GiB. -(Worker_TP1 pid=442) INFO 04-22 01:39:31 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9577 to maintain the same effective KV cache size. 
-(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=242) DEBUG 04-22 01:39:31 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=242) INFO 04-22 01:39:31 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 2,390,592 tokens -(EngineCore pid=242) INFO 04-22 01:39:31 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 291.82x -(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=441) DEBUG 04-22 01:39:31 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=442) 2026-04-22 01:39:31,598 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP0 pid=441) 2026-04-22 01:39:31,599 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=441) DEBUG 04-22 01:39:31 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=442) 2026-04-22 01:39:31,612 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP0 pid=441) 2026-04-22 01:39:31,613 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=442) DEBUG 04-22 01:39:31 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=441) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=242) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=242) DEBUG 04-22 01:39:36 [config/vllm.py:1530] Max num batched tokens below allreduce-rms fusion threshold, allreduce-rms fusion will be enabled for all num_tokens. 
-(EngineCore pid=242) INFO 04-22 01:39:36 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP0 pid=441) DEBUG 04-22 01:39:36 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=442) DEBUG 04-22 01:39:36 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=242) DEBUG 04-22 01:39:36 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=242) DEBUG 04-22 01:39:36 [v1/engine/core.py:1158] EngineCore waiting for work. -(EngineCore pid=242) DEBUG 04-22 01:39:36 [v1/engine/core.py:1158] EngineCore waiting for work. -(APIServer pid=1) INFO 04-22 01:39:36 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] -(APIServer pid=1) WARNING 04-22 01:39:37 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'repetition_penalty': 1.05, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. -(APIServer pid=1) DEBUG 04-22 01:39:37 [renderers/base.py:197] Warming up chat template processing... -(APIServer pid=1) DEBUG 04-22 01:39:37 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e826d9-2c80016f419c8b023123a696;b016afaf-c2fd-411a-aa52-9e6efd66e7ec) -(APIServer pid=1) DEBUG 04-22 01:39:37 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 01:39:37 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/resolve/main/processor_config.json. -(APIServer pid=1) DEBUG 04-22 01:39:37 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. 
(Request ID: Root=1-69e826d9-30b1e4fc4eae13e100557232;38380ba9-6fd5-4d55-b45f-28872cddb86b) -(APIServer pid=1) DEBUG 04-22 01:39:37 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 01:39:37 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/resolve/main/preprocessor_config.json. -(Worker_TP1 pid=442) DEBUG 04-22 01:39:37 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=441) DEBUG 04-22 01:39:37 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) INFO 04-22 01:39:38 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. -(APIServer pid=1) DEBUG 04-22 01:39:38 [renderers/base.py:203] Chat template warmup completed in 0.762s -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:37] Available routes are: -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /load, Methods: GET -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /version, Methods: GET -(APIServer pid=1) INFO 04-22 01:39:38 
[entrypoints/launcher.py:46] Route: /health, Methods: GET -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /ping, Methods: GET -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /ping, Methods: POST -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST -(APIServer pid=1) INFO 04-22 01:39:38 [entrypoints/launcher.py:46] Route: /v1/completions/render, 
Methods: POST -(APIServer pid=1) INFO: Started server process [1] -(APIServer pid=1) INFO: Waiting for application startup. -(APIServer pid=1) INFO: Application startup complete. -(APIServer pid=1) DEBUG 04-22 01:39:43 [v1/engine/async_llm.py:875] Called check_health. -(APIServer pid=1) INFO: 10.129.8.2:39450 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp4pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp4pp1dp1--8192.log deleted file mode 100644 index 884fc6de..00000000 --- a/accuracy/results/v0.19.0/logs/qwen-qwen2-5-7b-instruct--h100-80gb--tp4pp1dp1--8192.log +++ /dev/null @@ -1,2766 +0,0 @@ -DEBUG 04-22 01:39:53 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:39:53 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:39:53 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:39:53 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:39:53 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:39:53 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:39:53 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:39:53 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:39:53 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:39:53 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:39:53 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:39:53 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:39:58 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-22 01:40:00 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 01:40:00 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:40:00 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:40:00 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:40:00 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 01:40:00 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:40:00 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:40:00 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:40:00 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen2.5-7B-Instruct -(APIServer pid=1) INFO 04-22 01:40:00 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:40:00 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:40:00 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen2.5-7B-Instruct', 'model': 'Qwen/Qwen2.5-7B-Instruct', 'max_model_len': 8192, 'tensor_parallel_size': 4, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:40:00 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:40:00 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:40:00 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0004032 secs -(APIServer pid=1) INFO 04-22 01:40:00 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM -(APIServer pid=1) INFO 04-22 01:40:00 [config/model.py:1678] 
Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 01:40:00 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:40:00 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:40:00 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:40:00 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:40:00 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:40:00 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 01:40:00 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:40:00 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) INFO 04-22 01:40:02 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 01:40:02 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:40:02 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:40:02 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:40:02 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:40:06 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
-DEBUG 04-22 01:40:06 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:40:06 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:40:06 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:40:06 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:40:06 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:40:06 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:40:06 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:40:06 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:40:06 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:40:06 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:40:06 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:40:11 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=245) DEBUG 04-22 01:40:12 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:40:12 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=245) DEBUG 04-22 01:40:12 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/ec127100-6ce5-4395-a73b-d83623525879'], outputs=['ipc:///tmp/86065b28-fa27-42f9-9a3a-eec26e577961'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=245) DEBUG 04-22 01:40:12 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=245) DEBUG 04-22 01:40:12 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=245) DEBUG 04-22 01:40:12 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=245) DEBUG 04-22 01:40:12 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=245) DEBUG 04-22 01:40:12 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=245) INFO 04-22 01:40:12 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen2.5-7B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-7B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen2.5-7B-Instruct, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [292, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=245) WARNING 04-22 01:40:12 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=245) INFO 04-22 01:40:12 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.3.124 (local), world_size=4, local_world_size=4 -(EngineCore pid=245) DEBUG 04-22 01:40:12 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/cb21d1be-33a2-4133-82b9-d034295f0bd6 -(EngineCore pid=245) DEBUG 04-22 01:40:12 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3], buffer_handle=(4, 16777216, 10, 'psm_c6d422d2'), local_subscribe_addr='ipc:///tmp/cb21d1be-33a2-4133-82b9-d034295f0bd6', local_notify_addr='ipc:///tmp/c67727b1-0e20-4e5c-8290-cb9d665888f0', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 01:40:16 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:40:16 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:40:16 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:40:16 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:40:16 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:40:16 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:40:16 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:40:16 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:40:16 [platforms/__init__.py:62] Checking if CUDA platform is available. 
-DEBUG 04-22 01:40:16 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:40:16 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:40:16 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:40:16 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:40:16 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:40:16 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:165] Checking if CPU platform is available. 
-DEBUG 04-22 01:40:16 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:40:16 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:40:16 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:40:16 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:40:16 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:40:16 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:40:21 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:40:21 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:40:21 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-22 01:40:21 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:40:22 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:40:22 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:40:22 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:40:22 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) DEBUG 04-22 01:40:22 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -DEBUG 04-22 01:40:22 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:40:22 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:40:22 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:40:22 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 01:40:22 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:40:22 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:40:22 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:40:22 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-DEBUG 04-22 01:40:22 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:40:22 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:40:22 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:40:22 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(Worker pid=444) DEBUG 04-22 01:40:23 [distributed/parallel_state.py:1356] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:50769 backend=nccl -(Worker pid=444) INFO 04-22 01:40:23 [distributed/parallel_state.py:1400] world_size=4 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:50769 backend=nccl -(Worker pid=447) DEBUG 04-22 01:40:23 [distributed/parallel_state.py:1356] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:50769 backend=nccl -(Worker pid=447) INFO 04-22 01:40:23 [distributed/parallel_state.py:1400] world_size=4 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:50769 backend=nccl -(Worker pid=446) DEBUG 04-22 01:40:24 [distributed/parallel_state.py:1356] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:50769 backend=nccl -(Worker pid=446) INFO 04-22 01:40:24 [distributed/parallel_state.py:1400] world_size=4 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:50769 backend=nccl -(Worker pid=445) DEBUG 04-22 01:40:24 [distributed/parallel_state.py:1356] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:50769 backend=nccl -(Worker pid=445) INFO 04-22 01:40:24 [distributed/parallel_state.py:1400] world_size=4 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:50769 backend=nccl -[Gloo] Rank 2 is connected to 3 peer ranks. 
Expected number of connected peer ranks is : 3 -[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=444) DEBUG 04-22 01:40:24 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=447) DEBUG 04-22 01:40:24 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=446) DEBUG 04-22 01:40:24 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=445) DEBUG 04-22 01:40:24 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3 -(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=447) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=447) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=446) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=446) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=444) DEBUG 04-22 01:40:24 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=444) INFO 04-22 01:40:24 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=445) DEBUG 04-22 01:40:25 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=446) DEBUG 04-22 01:40:25 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=444) DEBUG 04-22 01:40:25 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=447) DEBUG 04-22 01:40:25 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=444) DEBUG 04-22 01:40:25 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/567daeca-6b5c-4b71-b54f-b51254ee4b10 -(Worker pid=444) DEBUG 04-22 01:40:25 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3], buffer_handle=(3, 4194304, 6, 'psm_14064846'), local_subscribe_addr='ipc:///tmp/567daeca-6b5c-4b71-b54f-b51254ee4b10', local_notify_addr='ipc:///tmp/008725ac-f2cf-41d2-a95c-2f8274b2d1fd', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=446) DEBUG 04-22 01:40:25 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/567daeca-6b5c-4b71-b54f-b51254ee4b10 -(Worker pid=445) DEBUG 04-22 01:40:25 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/567daeca-6b5c-4b71-b54f-b51254ee4b10 -(Worker pid=447) DEBUG 04-22 01:40:25 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/567daeca-6b5c-4b71-b54f-b51254ee4b10 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(Worker pid=444) INFO 04-22 01:40:25 [distributed/parallel_state.py:1716] rank 0 in world size 4 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(Worker pid=444) DEBUG 04-22 01:40:25 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776822025.549855, auto_measure=True -(Worker pid=444) DEBUG 04-22 01:40:25 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=447) DEBUG 04-22 01:40:25 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776822025.587866, auto_measure=True -(Worker pid=447) DEBUG 04-22 01:40:25 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=446) DEBUG 04-22 01:40:25 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776822025.6051173, auto_measure=True -(Worker pid=446) DEBUG 04-22 01:40:25 [v1/worker/gpu_worker.py:285] worker 
requested memory: 75.23GiB -(Worker pid=445) DEBUG 04-22 01:40:25 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.64GiB, total_memory=79.19GiB, cuda_memory=1.55GiB, torch_memory=0.02GiB, non_torch_memory=1.53GiB, timestamp=1776822025.6054552, auto_measure=True -(Worker pid=445) DEBUG 04-22 01:40:25 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=444) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=444) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=447) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=447) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=444) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=447) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:40:25 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(Worker pid=446) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=445) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=446) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=445) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=444) DEBUG 04-22 01:40:25 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=446) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=445) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=447) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=447) DEBUG 04-22 01:40:25 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). 
-(Worker pid=446) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=445) DEBUG 04-22 01:40:25 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=444) DEBUG 04-22 01:40:25 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_TP0 pid=444) INFO 04-22 01:40:25 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen2.5-7B-Instruct... -(Worker pid=445) DEBUG 04-22 01:40:25 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=446) DEBUG 04-22 01:40:25 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker_TP0 pid=444) DEBUG 04-22 01:40:26 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_TP0 pid=444) INFO 04-22 01:40:26 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(Worker_TP0 pid=444) INFO 04-22 01:40:26 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP3 pid=447) DEBUG 04-22 01:40:26 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP3 pid=447) DEBUG 04-22 01:40:26 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP3 pid=447) DEBUG 04-22 01:40:26 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP3 pid=447) DEBUG 04-22 01:40:26 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP0 pid=444) DEBUG 04-22 01:40:26 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=444) DEBUG 04-22 01:40:26 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=444) DEBUG 04-22 01:40:26 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=444) DEBUG 04-22 01:40:26 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP2 pid=446) DEBUG 04-22 01:40:26 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP2 pid=446) DEBUG 04-22 01:40:26 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP2 pid=446) DEBUG 04-22 01:40:26 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP2 pid=446) DEBUG 04-22 01:40:26 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP1 pid=445) DEBUG 04-22 01:40:26 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP1 pid=445) DEBUG 04-22 01:40:26 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP1 pid=445) DEBUG 04-22 01:40:26 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=445) DEBUG 04-22 01:40:26 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP2 pid=446) DEBUG 04-22 01:40:26 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors']] -(Worker_TP0 pid=444) DEBUG 04-22 01:40:26 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00004.safetensors', 'model-00002-of-00004.safetensors', 'model-00001-of-00004.safetensors', 'model-00003-of-00004.safetensors']] -(Worker_TP3 pid=447) DEBUG 04-22 01:40:26 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00004.safetensors', 'model-00003-of-00004.safetensors', 'model-00004-of-00004.safetensors', 'model-00001-of-00004.safetensors']] -(Worker_TP0 pid=444) Loading safetensors checkpoint shards: 0% Completed | 0/4 [00:00 -(Worker_TP2 pid=446) DEBUG 04-22 01:40:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=447) DEBUG 04-22 01:40:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=446) DEBUG 04-22 01:40:30 [compilation/decorators.py:528] Start compiling function -(Worker_TP3 pid=447) DEBUG 04-22 01:40:30 [compilation/decorators.py:528] Start compiling function -(Worker_TP0 pid=444) DEBUG 04-22 01:40:30 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:40:30 [compilation/decorators.py:528] Start compiling function -(EngineCore pid=245) DEBUG 04-22 01:40:30 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) DEBUG 04-22 01:40:32 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=40173243b0 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/926abca848/rank_1_0/backbone -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 
{'CMAKE_BUILD_TYPE': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 
'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 
'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 
'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 
'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 
'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 
'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 
[compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, 
-(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP1 
pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 
[compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP1 pid=445) DEBUG 04-22 01:40:33 [compilation/backends.py:1074] Vllm config hash: 40173243b0 -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP2 pid=446) DEBUG 04-22 
01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP0 pid=444) INFO 04-22 01:40:34 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone for vLLM's torch.compile -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 
cfg=40173243b0 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=40173243b0 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/926abca848/rank_2_0/backbone -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, 
-(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, 
-(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP0 
pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, 
-(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 
[compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] Vllm config hash: 40173243b0 -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VERBOSE': False, 
-(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, 
-(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP2 
pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP2 pid=446) 
DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP2 pid=446) DEBUG 04-22 
01:40:34 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP2 pid=446) 
DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 
256, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, 
-(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] Vllm config hash: 40173243b0 -(Worker_TP0 pid=444) INFO 04-22 01:40:34 [compilation/backends.py:1111] Dynamo bytecode transform time: 3.91 s -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 2 MB,Maximal number of tokens used by Flashinfer Allreduce 
Fusion: 292 -(Worker_TP0 pid=444) INFO 04-22 01:40:34 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=40173243b0 comp=e546579c48 code=80ccc4d2070afc59326f063a812f0a14f7bf98a41301c4b1501c344df1a07b44 dir=/data/.cache/vllm/torch_compile_cache/926abca848/rank_3_0/backbone -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 
[compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, 
-(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP3 pid=447) DEBUG 04-22 
01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP3 pid=447) 
DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 
[compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 
[compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP3 pid=447) DEBUG 04-22 
01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP3 
pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP3 
pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 
'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [compilation/backends.py:1074] Vllm config hash: 40173243b0 -(Worker_TP0 pid=444) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. 
-(Worker_TP0 pid=444) return func(*args, **kwargs) -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=0, max_token_num=292, hidden_dim=3584, dtype=torch.bfloat16 -(Worker_TP1 pid=445) DEBUG 04-22 01:40:34 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=1, max_token_num=292, hidden_dim=3584, dtype=torch.bfloat16 -(Worker_TP2 pid=446) DEBUG 04-22 01:40:34 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=2, max_token_num=292, hidden_dim=3584, dtype=torch.bfloat16 -(Worker_TP3 pid=447) DEBUG 04-22 01:40:34 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=4, rank=3, max_token_num=292, hidden_dim=3584, dtype=torch.bfloat16 -(Worker_TP0 pid=444) INFO 04-22 01:40:34 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 292), (293, 8192)] -(Worker_TP0 pid=444) DEBUG 04-22 01:40:34 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(Worker_TP2 pid=446) DEBUG 04-22 01:40:35 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=446) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:35 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms 
-(Worker_TP0 pid=444) DEBUG 04-22 01:40:35 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:35 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=447) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:35 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP2 pid=446) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 34.0 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:35 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP2 pid=446) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:35 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP1 pid=445) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 32.9 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:35 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP1 pid=445) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:35 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP0 pid=444) DEBUG 04-22 01:40:35 
[compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 33.8 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:35 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP0 pid=444) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:35 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP3 pid=447) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 33.9 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:35 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP3 pid=447) DEBUG 04-22 01:40:35 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=444) INFO 04-22 01:40:37 [compilation/backends.py:372] Cache the graph of compile range (1, 292) for later use -(Worker_TP0 pid=444) DEBUG 04-22 01:40:37 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 292) from inductor_standalone via handle ('artifact_compile_range_1_292_subgraph_0', '/data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone/artifact_compile_range_1_292_subgraph_0') -(Worker_TP1 pid=445) DEBUG 04-22 01:40:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:38 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) -(Worker_TP1 pid=445) DEBUG 04-22 01:40:38 
[compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=445) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=446) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:38 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) -(Worker_TP2 pid=446) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=446) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:38 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) -(Worker_TP0 pid=444) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.8 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=444) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:38 
[compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=447) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:38 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) -(Worker_TP3 pid=447) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=447) DEBUG 04-22 01:40:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=444) INFO 04-22 01:40:39 [compilation/backends.py:372] Cache the graph of compile range (293, 8192) for later use -(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/backends.py:377] Store the 0-th graph for compile range(293, 8192) from inductor_standalone via handle ('artifact_compile_range_293_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone/artifact_compile_range_293_8192_subgraph_0') -(Worker_TP1 pid=445) DEBUG 04-22 01:40:39 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:39 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=446) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP1 pid=445) DEBUG 04-22 
01:40:39 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=445) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 60.6 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=445) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:39 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP2 pid=446) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 59.6 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP2 pid=446) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 60.2 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:39 [compilation/.../utility/noop_elimination.py:105] 
Removed 0 no-op reshapes and slices -(Worker_TP3 pid=447) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:39 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP3 pid=447) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 71.6 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:39 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP3 pid=447) DEBUG 04-22 01:40:39 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.6 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:39 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 292) from inductor_standalone via handle ('artifact_compile_range_1_292_subgraph_1', '/data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone/artifact_compile_range_1_292_subgraph_1') -(Worker_TP1 pid=445) DEBUG 04-22 01:40:40 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:40 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) -(Worker_TP1 pid=445) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:40 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=445) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:40 [compilation/.../utility/noop_elimination.py:105] 
Removed 0 no-op reshapes and slices -(Worker_TP2 pid=446) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:40 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) -(Worker_TP2 pid=446) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:40 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=446) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:40 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:40 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) -(Worker_TP0 pid=444) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:40 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=444) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:40 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=447) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:40 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) -(Worker_TP3 pid=447) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP3 pid=447) 
DEBUG 04-22 01:40:40 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=447) DEBUG 04-22 01:40:40 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:42 [compilation/backends.py:377] Store the 1-th graph for compile range(293, 8192) from inductor_standalone via handle ('artifact_compile_range_293_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone/artifact_compile_range_293_8192_subgraph_1') -(APIServer pid=1) DEBUG 04-22 01:40:42 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP2 pid=446) DEBUG 04-22 01:40:43 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP2 pid=446) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:43 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:43 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP2 pid=446) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 60.5 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:43 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP2 pid=446) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:43 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices 
-(Worker_TP0 pid=444) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:43 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=445) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 60.6 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.1 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:43 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP1 pid=445) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:43 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=444) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 61.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:43 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP0 pid=444) DEBUG 04-22 01:40:43 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 292) from inductor_standalone via handle ('artifact_compile_range_1_292_subgraph_28', '/data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone/artifact_compile_range_1_292_subgraph_28') -(Worker_TP0 pid=444) INFO 04-22 01:40:44 [compilation/backends.py:390] Compiling a graph for compile range (1, 292) takes 6.28 s -(Worker_TP2 pid=446) DEBUG 04-22 01:40:44 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices 
-(Worker_TP2 pid=446) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:44 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) -(Worker_TP2 pid=446) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP2 pid=446) DEBUG 04-22 01:40:44 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP2 pid=446) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:44 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=445) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:44 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) -(Worker_TP1 pid=445) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms -(Worker_TP1 pid=445) DEBUG 04-22 01:40:44 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=445) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) -(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 
[compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 [compilation/backends.py:377] Store the 28-th graph for compile range(293, 8192) from inductor_standalone via handle ('artifact_compile_range_293_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone/artifact_compile_range_293_8192_subgraph_28') -(Worker_TP0 pid=444) INFO 04-22 01:40:44 [compilation/backends.py:390] Compiling a graph for compile range (293, 8192) takes 6.91 s -(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 73.2 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:44 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/926abca848/rank_0_0/backbone/computation_graph.py -(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] 
NoOpEliminationPass completed in 0.4 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/passes/pass_manager.py:100] Skipping with compile range (293, 8192) -(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP3 pid=447) DEBUG 04-22 01:40:44 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=444) INFO 04-22 01:40:45 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/f81953de96e95b6cb6058e30a79e97ddbce2a651eea9102c7d30fa4c49e7c4f4/rank_0_0/model -(Worker_TP0 pid=444) INFO 04-22 01:40:45 [compilation/monitor.py:48] torch.compile took 15.26 s in total -(Worker_TP0 pid=444) INFO 04-22 01:40:46 [compilation/monitor.py:76] Initial profiling/warmup run took 1.13 s -(Worker_TP0 pid=444) INFO 04-22 01:40:52 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP0 pid=444) INFO 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP3 pid=447) INFO 04-22 01:40:52 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP3 pid=447) INFO 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: 
None -(Worker_TP2 pid=446) INFO 04-22 01:40:52 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP2 pid=446) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP2 pid=446) INFO 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 
01:40:52 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 104.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 104.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, 
has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=447) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 60.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP3 pid=447) INFO 04-22 01:40:52 [distributed/device_communicators/custom_all_reduce.py:216] Registering 228 cuda graph addresses -(Worker_TP0 pid=444) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 60.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP0 pid=444) INFO 04-22 01:40:52 [distributed/device_communicators/custom_all_reduce.py:216] Registering 228 cuda graph addresses -(Worker_TP1 pid=445) INFO 04-22 01:40:52 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP1 pid=445) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP1 pid=445) INFO 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(APIServer pid=1) DEBUG 04-22 01:40:52 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP2 pid=446) DEBUG 04-22 01:40:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 104.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, 
has_lora=False, num_active_loras=0)) -(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 60.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP2 pid=446) INFO 04-22 01:40:53 [distributed/device_communicators/custom_all_reduce.py:216] Registering 228 cuda graph addresses -(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 104.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=445) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 60.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP1 pid=445) INFO 04-22 01:40:53 [distributed/device_communicators/custom_all_reduce.py:216] Registering 228 cuda graph addresses -(Worker_TP3 pid=447) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP3 pid=447) INFO 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.69 GiB total -(Worker_TP2 pid=446) DEBUG 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP2 pid=446) INFO 04-22 01:40:53 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.69 GiB total -(Worker_TP0 pid=444) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP0 pid=444) INFO 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.69 GiB total -(Worker_TP1 pid=445) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP1 pid=445) INFO 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.69 GiB total -(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:430] Free memory after profiling: 71.44 GiB (total), 69.03 GiB (within requested) -(Worker_TP3 
pid=447) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:435] Memory profiling takes 23.92 seconds. Total non KV cache memory: 7.89GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 2.13GiB; weights memory: 3.55GiB. -(Worker_TP3 pid=447) INFO 04-22 01:40:54 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9587 to maintain the same effective KV cache size. -(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=444) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP0 pid=444) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:430] Free memory after profiling: 71.44 GiB (total), 69.03 GiB (within requested) -(Worker_TP0 pid=444) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:435] Memory profiling takes 24.14 seconds. Total non KV cache memory: 7.89GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 2.13GiB; weights memory: 3.55GiB. -(Worker_TP0 pid=444) INFO 04-22 01:40:54 [v1/worker/gpu_worker.py:436] Available KV cache memory: 67.34 GiB -(Worker_TP0 pid=444) INFO 04-22 01:40:54 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9587 to maintain the same effective KV cache size. 
-(Worker_TP0 pid=444) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=445) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP1 pid=445) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:430] Free memory after profiling: 71.44 GiB (total), 69.03 GiB (within requested) -(Worker_TP1 pid=445) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:435] Memory profiling takes 24.25 seconds. Total non KV cache memory: 7.89GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 2.13GiB; weights memory: 3.55GiB. -(Worker_TP1 pid=445) INFO 04-22 01:40:54 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9587 to maintain the same effective KV cache size. 
-(Worker_TP1 pid=445) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=446) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:424] Initial free memory: 77.64 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP2 pid=446) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:430] Free memory after profiling: 71.44 GiB (total), 69.03 GiB (within requested) -(Worker_TP2 pid=446) DEBUG 04-22 01:40:54 [v1/worker/gpu_worker.py:435] Memory profiling takes 24.19 seconds. Total non KV cache memory: 7.89GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 2.13GiB; weights memory: 3.55GiB. -(Worker_TP2 pid=446) INFO 04-22 01:40:54 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9587 to maintain the same effective KV cache size. 
-(Worker_TP2 pid=446) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) INFO 04-22 01:40:54 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 5,043,280 tokens -(EngineCore pid=245) INFO 04-22 01:40:54 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 615.63x -(Worker_TP2 pid=446) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=445) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=444) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=447) 2026-04-22 01:40:54,441 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP2 pid=446) 2026-04-22 01:40:54,441 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP1 pid=445) 2026-04-22 01:40:54,441 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP0 pid=444) 2026-04-22 01:40:54,441 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP2 pid=446) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=445) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=447) 2026-04-22 01:40:54,457 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP0 pid=444) 2026-04-22 01:40:54,458 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP1 pid=445) 2026-04-22 01:40:54,458 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP2 pid=446) 2026-04-22 01:40:54,458 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP3 pid=447) DEBUG 04-22 01:40:54 
[v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=444) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=245) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=245) INFO 04-22 01:40:59 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP0 pid=444) DEBUG 04-22 01:40:59 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=446) DEBUG 04-22 01:40:59 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=445) DEBUG 04-22 01:40:59 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=447) DEBUG 04-22 01:40:59 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 01:40:59 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=245) DEBUG 04-22 01:40:59 [v1/engine/core.py:1158] EngineCore waiting for work. -(EngineCore pid=245) DEBUG 04-22 01:40:59 [v1/engine/core.py:1158] EngineCore waiting for work. -(APIServer pid=1) INFO 04-22 01:40:59 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] -(APIServer pid=1) WARNING 04-22 01:41:00 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'repetition_penalty': 1.05, 'temperature': 0.7, 'top_k': 20, 'top_p': 0.8}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. 
-(APIServer pid=1) DEBUG 04-22 01:41:00 [renderers/base.py:197] Warming up chat template processing... -(APIServer pid=1) DEBUG 04-22 01:41:00 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e8272c-2c488e1d2cc021b830dd2d98;f63a5c98-c381-4f93-974b-8d392aa77a95) -(APIServer pid=1) DEBUG 04-22 01:41:00 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 01:41:00 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/resolve/main/processor_config.json. -(APIServer pid=1) DEBUG 04-22 01:41:00 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e8272c-779a7a1a4cc3e6661e65eebc;e6a5907f-a49c-4e52-9bc2-7db82e7d73db) -(APIServer pid=1) DEBUG 04-22 01:41:00 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 01:41:00 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/Qwen/Qwen2.5-7B-Instruct/resolve/main/preprocessor_config.json. -(Worker_TP1 pid=445) DEBUG 04-22 01:41:00 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=444) DEBUG 04-22 01:41:00 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP2 pid=446) DEBUG 04-22 01:41:00 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP3 pid=447) DEBUG 04-22 01:41:00 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) INFO 04-22 01:41:01 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. 
-(APIServer pid=1) DEBUG 04-22 01:41:01 [renderers/base.py:203] Chat template warmup completed in 0.733s -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:37] Available routes are: -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /load, Methods: GET -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /version, Methods: GET -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /health, Methods: GET -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /ping, Methods: GET -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /ping, Methods: POST -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST -(APIServer pid=1) INFO 04-22 01:41:01 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST -(APIServer pid=1) INFO 04-22 01:41:01 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST -(APIServer pid=1) INFO: Started server process [1] -(APIServer pid=1) INFO: Waiting for application startup. -(APIServer pid=1) INFO: Application startup complete. -(APIServer pid=1) DEBUG 04-22 01:41:09 [v1/engine/async_llm.py:875] Called check_health. -(APIServer pid=1) INFO: 10.131.2.2:40078 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/qwen-qwen3-14b--h100-80gb--tp5pp1dp1--8192.FAILED.log b/accuracy/results/v0.19.0/logs/qwen-qwen3-14b--h100-80gb--tp5pp1dp1--8192.FAILED.log deleted file mode 100644 index 8c18899e..00000000 --- a/accuracy/results/v0.19.0/logs/qwen-qwen3-14b--h100-80gb--tp5pp1dp1--8192.FAILED.log +++ /dev/null @@ -1,675 +0,0 @@ -DEBUG 04-22 00:27:10 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
-DEBUG 04-22 00:27:10 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:27:10 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:27:10 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:27:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:27:10 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:27:10 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:27:10 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:27:10 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:27:10 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:27:10 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:27:10 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:27:15 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:27:17 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 00:27:17 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:27:17 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:27:17 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:27:17 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:27:17 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:27:17 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:27:17 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:27:17 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen3-14B -(APIServer pid=1) INFO 04-22 00:27:17 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:27:17 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:27:17 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen3-14B', 'model': 'Qwen/Qwen3-14B', 'max_model_len': 8192, 'tensor_parallel_size': 5, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:27:17 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:27:17 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen3.Qwen3ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:27:17 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0010808 secs -(APIServer pid=1) INFO 04-22 00:27:17 [config/model.py:549] Resolved architecture: Qwen3ForCausalLM -(APIServer pid=1) INFO 04-22 00:27:17 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:27:17 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:27:17 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:27:17 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:27:17 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:27:17 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 00:27:17 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 00:27:17 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:27:17 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) INFO 04-22 00:27:17 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 00:27:17 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:27:17 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:27:18 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:27:18 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:27:21 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:27:21 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:27:21 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:27:21 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:27:21 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:27:21 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:27:21 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:27:21 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:27:21 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:27:21 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:27:21 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
-DEBUG 04-22 00:27:21 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:27:26 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=243) DEBUG 04-22 00:27:28 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:27:28 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=243) DEBUG 04-22 00:27:28 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/1bc2d50e-7e9a-4172-a205-50e79bb5d230'], outputs=['ipc:///tmp/442333a8-48d7-49cf-8792-67cac68c97ad'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 00:27:28 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 00:27:28 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 00:27:28 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 00:27:28 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 00:27:28 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=243) INFO 04-22 00:27:28 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen3-14B', speculative_config=None, tokenizer='Qwen/Qwen3-14B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=5, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-14B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) WARNING 04-22 00:27:28 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=243) INFO 04-22 00:27:28 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.128.4.195 (local), world_size=5, local_world_size=5 -(EngineCore pid=243) DEBUG 04-22 00:27:28 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/63114b8d-5acb-4b99-8fba-8ac83335cae3 -(EngineCore pid=243) DEBUG 04-22 00:27:28 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1, 2, 3, 4], buffer_handle=(5, 16777216, 10, 'psm_374da923'), local_subscribe_addr='ipc:///tmp/63114b8d-5acb-4b99-8fba-8ac83335cae3', local_notify_addr='ipc:///tmp/82d7443c-6675-4caa-abc2-1d5430bdfb80', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 00:27:31 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:27:31 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:27:31 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:27:31 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:27:31 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:27:31 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. 
-DEBUG 04-22 00:27:31 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:27:31 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:27:31 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:27:31 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:27:31 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:27:31 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
-DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:27:31 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:27:31 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:27:31 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:27:31 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:27:31 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:27:31 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:27:36 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:27:36 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:27:37 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:27:37 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:27:37 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(APIServer pid=1) DEBUG 04-22 00:27:38 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -DEBUG 04-22 00:27:38 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:27:38 [plugins/__init__.py:49] All plugins in this group will be loaded. 
Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 00:27:38 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:27:38 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 00:27:38 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:27:38 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 00:27:38 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:27:38 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-DEBUG 04-22 00:27:38 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:27:38 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:27:38 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(Worker pid=442) DEBUG 04-22 00:27:40 [distributed/parallel_state.py:1356] world_size=5 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl -(Worker pid=442) INFO 04-22 00:27:40 [distributed/parallel_state.py:1400] world_size=5 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl -(Worker pid=446) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1356] world_size=5 rank=4 local_rank=4 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl -(Worker pid=446) INFO 04-22 00:27:41 [distributed/parallel_state.py:1400] world_size=5 rank=4 local_rank=4 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl -(Worker pid=444) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1356] world_size=5 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl -(Worker pid=444) INFO 04-22 00:27:41 [distributed/parallel_state.py:1400] world_size=5 rank=2 local_rank=2 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl -(Worker pid=445) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1356] world_size=5 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl -(Worker pid=445) INFO 04-22 00:27:41 [distributed/parallel_state.py:1400] world_size=5 rank=3 local_rank=3 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl -(Worker pid=443) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1356] world_size=5 rank=1 local_rank=1 
distributed_init_method=tcp://127.0.0.1:60649 backend=nccl -(Worker pid=443) INFO 04-22 00:27:41 [distributed/parallel_state.py:1400] world_size=5 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:60649 backend=nccl -[Gloo] Rank 1 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 -[Gloo] Rank 0 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 -[Gloo] Rank 2 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 -[Gloo] Rank 4 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 -[Gloo] Rank 3 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 -(Worker pid=445) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=444) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=443) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=442) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=446) DEBUG 04-22 00:27:41 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 2 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 -[Gloo] Rank 1 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 -[Gloo] Rank 0 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 -[Gloo] Rank 3 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 -[Gloo] Rank 4 is connected to 4 peer ranks. Expected number of connected peer ranks is : 4 -(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
-(Worker pid=446) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=446) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=442) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=442) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=442) DEBUG 04-22 00:27:41 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=442) INFO 04-22 00:27:41 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=445) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=445) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=442) WARNING 04-22 00:27:42 [distributed/device_communicators/symm_mem.py:73] SymmMemCommunicator: World size 5 not supported, communicator is not available. 
-(Worker pid=445) WARNING 04-22 00:27:42 [distributed/device_communicators/symm_mem.py:73] SymmMemCommunicator: World size 5 not supported, communicator is not available. -(Worker pid=443) WARNING 04-22 00:27:42 [distributed/device_communicators/symm_mem.py:73] SymmMemCommunicator: World size 5 not supported, communicator is not available. -(Worker pid=446) WARNING 04-22 00:27:42 [distributed/device_communicators/symm_mem.py:73] SymmMemCommunicator: World size 5 not supported, communicator is not available. -(Worker pid=444) WARNING 04-22 00:27:42 [distributed/device_communicators/symm_mem.py:73] SymmMemCommunicator: World size 5 not supported, communicator is not available. -(Worker pid=444) WARNING 04-22 00:27:42 [distributed/device_communicators/custom_all_reduce.py:106] Custom allreduce is disabled due to an unsupported world size: 5. Supported world sizes: [2, 4, 6, 8]. To silence this warning, specify disable_custom_all_reduce=True explicitly. -(Worker pid=442) WARNING 04-22 00:27:42 [distributed/device_communicators/custom_all_reduce.py:106] Custom allreduce is disabled due to an unsupported world size: 5. Supported world sizes: [2, 4, 6, 8]. To silence this warning, specify disable_custom_all_reduce=True explicitly. -(Worker pid=446) WARNING 04-22 00:27:42 [distributed/device_communicators/custom_all_reduce.py:106] Custom allreduce is disabled due to an unsupported world size: 5. Supported world sizes: [2, 4, 6, 8]. To silence this warning, specify disable_custom_all_reduce=True explicitly. -(Worker pid=443) WARNING 04-22 00:27:42 [distributed/device_communicators/custom_all_reduce.py:106] Custom allreduce is disabled due to an unsupported world size: 5. Supported world sizes: [2, 4, 6, 8]. To silence this warning, specify disable_custom_all_reduce=True explicitly. -(Worker pid=445) WARNING 04-22 00:27:42 [distributed/device_communicators/custom_all_reduce.py:106] Custom allreduce is disabled due to an unsupported world size: 5. 
Supported world sizes: [2, 4, 6, 8]. To silence this warning, specify disable_custom_all_reduce=True explicitly. -(Worker pid=442) DEBUG 04-22 00:27:42 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/7138be5c-96a9-4137-95c0-b4d63d41bb97 -(Worker pid=442) DEBUG 04-22 00:27:42 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3, 4], buffer_handle=(4, 4194304, 6, 'psm_973a6fc4'), local_subscribe_addr='ipc:///tmp/7138be5c-96a9-4137-95c0-b4d63d41bb97', local_notify_addr='ipc:///tmp/835c2a1c-53fe-42e2-9c0c-85a4cc18cdd1', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=446) DEBUG 04-22 00:27:42 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/7138be5c-96a9-4137-95c0-b4d63d41bb97 -(Worker pid=444) DEBUG 04-22 00:27:42 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/7138be5c-96a9-4137-95c0-b4d63d41bb97 -(Worker pid=445) DEBUG 04-22 00:27:42 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/7138be5c-96a9-4137-95c0-b4d63d41bb97 -(Worker pid=443) DEBUG 04-22 00:27:42 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/7138be5c-96a9-4137-95c0-b4d63d41bb97 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(Worker pid=442) INFO 04-22 00:27:42 [distributed/parallel_state.py:1716] rank 0 in world size 5 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(Worker pid=444) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, timestamp=1776817663.1936586, auto_measure=True -(Worker pid=444) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=446) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, timestamp=1776817663.2081609, auto_measure=True -(Worker pid=446) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=442) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, timestamp=1776817663.2475388, auto_measure=True -(Worker pid=442) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=445) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, timestamp=1776817663.2508624, auto_measure=True -(Worker pid=445) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=444) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=77.71GiB, total_memory=79.19GiB, cuda_memory=1.48GiB, torch_memory=0.0GiB, non_torch_memory=1.48GiB, 
timestamp=1776817663.290896, auto_measure=True -(Worker pid=443) DEBUG 04-22 00:27:43 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=444) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=444) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=446) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=446) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=446) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=442) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=442) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=446) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=444) DEBUG 04-22 00:27:43 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). 
-(Worker pid=445) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=445) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=442) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=446) DEBUG 04-22 00:27:43 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=445) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=442) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=443) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=445) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=442) DEBUG 04-22 00:27:43 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(Worker pid=443) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=442) DEBUG 04-22 00:27:43 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=445) DEBUG 04-22 00:27:43 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=443) DEBUG 04-22 00:27:43 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=443) DEBUG 04-22 00:27:43 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=442) DEBUG 04-22 00:27:43 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_TP0 pid=442) INFO 04-22 00:27:43 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen3-14B... -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. 
-(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 613, in __init__ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.worker.load_model() -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 323, in load_model -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model_runner.load_model(load_dummy_weights=load_dummy_weights) -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP4 pid=446) ERROR 
04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4751, in load_model -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = model_loader.load_model( -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 55, in load_model -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = initialize_model( -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 57, in initialize_model -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = model_class(vllm_config=vllm_config, prefix=prefix) -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 
[v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 289, in __init__ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = Qwen3Model( -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 256, in __init__ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] super().__init__( -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 391, in __init__ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.embed_tokens = VocabParallelEmbedding( -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 254, in __init__ 
-(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.shard_indices = self._get_indices( -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 329, in _get_indices -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size) -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 91, in vocab_range_from_global_vocab_size -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] per_partition_vocab_size = divide(global_vocab_size, world_size) -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 63, in divide -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ensure_divisibility(numerator, denominator) -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 55, in ensure_divisibility -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] assert numerator % denominator == 0, "{} is not divisible by {}".format( -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP4 pid=446) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] AssertionError: 151936 is not divisible by 5 -(EngineCore 
pid=243) DEBUG 04-22 00:27:44 [v1/executor/multiproc_executor.py:419] Worker Termination: allow workers to gracefully shutdown -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 613, in __init__ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.worker.load_model() -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 323, in load_model -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model_runner.load_model(load_dummy_weights=load_dummy_weights) -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper 
-(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4751, in load_model -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = model_loader.load_model( -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 55, in load_model -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = initialize_model( -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File 
"/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 57, in initialize_model -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = model_class(vllm_config=vllm_config, prefix=prefix) -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 289, in __init__ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = Qwen3Model( -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 256, in __init__ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] super().__init__( -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 391, in __init__ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.embed_tokens = VocabParallelEmbedding( -(Worker_TP2 pid=444) 
ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 254, in __init__ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.shard_indices = self._get_indices( -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 329, in _get_indices -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size) -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 91, in vocab_range_from_global_vocab_size -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] per_partition_vocab_size = divide(global_vocab_size, world_size) -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 63, in divide -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ensure_divisibility(numerator, denominator) -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 55, in ensure_divisibility -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] assert numerator % denominator == 
0, "{} is not divisible by {}".format( -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP2 pid=444) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] AssertionError: 151936 is not divisible by 5 -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 613, in __init__ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.worker.load_model() -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 323, in load_model -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model_runner.load_model(load_dummy_weights=load_dummy_weights) -(Worker_TP3 pid=445) ERROR 04-22 
00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4751, in load_model -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = model_loader.load_model( -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 55, in load_model -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = initialize_model( -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP3 
pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 57, in initialize_model -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = model_class(vllm_config=vllm_config, prefix=prefix) -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 289, in __init__ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = Qwen3Model( -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 256, in __init__ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] super().__init__( -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 391, in __init__ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 
[v1/executor/multiproc_executor.py:857] self.embed_tokens = VocabParallelEmbedding( -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 254, in __init__ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.shard_indices = self._get_indices( -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 329, in _get_indices -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size) -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 91, in vocab_range_from_global_vocab_size -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] per_partition_vocab_size = divide(global_vocab_size, world_size) -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 63, in divide -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ensure_divisibility(numerator, denominator) -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 55, in ensure_divisibility 
-(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] assert numerator % denominator == 0, "{} is not divisible by {}".format( -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP3 pid=445) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] AssertionError: 151936 is not divisible by 5 -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 613, in __init__ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.worker.load_model() -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 323, in load_model -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 
[v1/executor/multiproc_executor.py:857] self.model_runner.load_model(load_dummy_weights=load_dummy_weights) -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4751, in load_model -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = model_loader.load_model( -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 55, in load_model -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = initialize_model( -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return 
func(*args, **kwargs) -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 57, in initialize_model -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = model_class(vllm_config=vllm_config, prefix=prefix) -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 289, in __init__ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = Qwen3Model( -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 256, in __init__ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] super().__init__( -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File 
"/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 391, in __init__ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.embed_tokens = VocabParallelEmbedding( -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 254, in __init__ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.shard_indices = self._get_indices( -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 329, in _get_indices -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size) -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 91, in vocab_range_from_global_vocab_size -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] per_partition_vocab_size = divide(global_vocab_size, world_size) -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 63, in divide -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ensure_divisibility(numerator, denominator) -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 
[v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 55, in ensure_divisibility -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] assert numerator % denominator == 0, "{} is not divisible by {}".format( -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP1 pid=443) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] AssertionError: 151936 is not divisible by 5 -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] WorkerProc failed to start. -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] Traceback (most recent call last): -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 826, in worker_main -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] worker = WorkerProc(*args, **kwargs) -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 613, in __init__ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.worker.load_model() -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File 
"/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_worker.py", line 323, in load_model -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model_runner.load_model(load_dummy_weights=load_dummy_weights) -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/worker/gpu_model_runner.py", line 4751, in load_model -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = model_loader.load_model( -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/base_loader.py", line 55, in load_model -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = initialize_model( -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File 
"/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] return func(*args, **kwargs) -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/model_loader/utils.py", line 57, in initialize_model -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] model = model_class(vllm_config=vllm_config, prefix=prefix) -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 289, in __init__ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.model = Qwen3Model( -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py", line 256, in __init__ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] super().__init__( -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/compilation/decorators.py", line 364, in __init__ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 
[v1/executor/multiproc_executor.py:857] old_init(self, **kwargs) -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py", line 391, in __init__ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.embed_tokens = VocabParallelEmbedding( -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 254, in __init__ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] self.shard_indices = self._get_indices( -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 329, in _get_indices -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank, tp_size) -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 91, in vocab_range_from_global_vocab_size -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] per_partition_vocab_size = divide(global_vocab_size, world_size) -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 63, in divide -(Worker_TP0 pid=442) ERROR 
04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ensure_divisibility(numerator, denominator) -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/utils.py", line 55, in ensure_divisibility -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] assert numerator % denominator == 0, "{} is not divisible by {}".format( -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(Worker_TP0 pid=442) ERROR 04-22 00:27:44 [v1/executor/multiproc_executor.py:857] AssertionError: 151936 is not divisible by 5 -[rank0]:[W422 00:27:45.545461215 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] EngineCore failed to start. 
-(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] Traceback (most recent call last): -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] return func(*args, **kwargs) -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__ -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] super().__init__( -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 114, in __init__ -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] self.model_executor = executor_class(vllm_config) -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 101, in __init__ -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] super().__init__(vllm_config) -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(EngineCore pid=243) ERROR 04-22 00:27:46 
[v1/engine/core.py:1108] return func(*args, **kwargs) -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 103, in __init__ -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] self._init_executor() -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 190, in _init_executor -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] self.workers = WorkerProc.wait_for_ready(unready_workers) -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 736, in wait_for_ready -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] raise e from None -(EngineCore pid=243) ERROR 04-22 00:27:46 [v1/engine/core.py:1108] Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause. 
-(EngineCore pid=243) Process EngineCore: -(EngineCore pid=243) Traceback (most recent call last): -(EngineCore pid=243) File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap -(EngineCore pid=243) self.run() -(EngineCore pid=243) File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run -(EngineCore pid=243) self._target(*self._args, **self._kwargs) -(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1112, in run_engine_core -(EngineCore pid=243) raise e -(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 1082, in run_engine_core -(EngineCore pid=243) engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) -(EngineCore pid=243) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(EngineCore pid=243) return func(*args, **kwargs) -(EngineCore pid=243) ^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 848, in __init__ -(EngineCore pid=243) super().__init__( -(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 114, in __init__ -(EngineCore pid=243) self.model_executor = executor_class(vllm_config) -(EngineCore pid=243) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 101, in __init__ -(EngineCore pid=243) super().__init__(vllm_config) -(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(EngineCore pid=243) return func(*args, **kwargs) -(EngineCore pid=243) ^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/abstract.py", line 103, in __init__ -(EngineCore pid=243) self._init_executor() 
-(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 190, in _init_executor -(EngineCore pid=243) self.workers = WorkerProc.wait_for_ready(unready_workers) -(EngineCore pid=243) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(EngineCore pid=243) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/executor/multiproc_executor.py", line 736, in wait_for_ready -(EngineCore pid=243) raise e from None -(EngineCore pid=243) Exception: WorkerProc initialization failed due to an exception in a background process. See stack trace for root cause. -(EngineCore pid=243) DEBUG 04-22 00:27:46 [v1/executor/multiproc_executor.py:438] Triggering shutdown of workers -(APIServer pid=1) Traceback (most recent call last): -(APIServer pid=1) File "/usr/local/bin/vllm", line 10, in -(APIServer pid=1) sys.exit(main()) -(APIServer pid=1) ^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/main.py", line 75, in main -(APIServer pid=1) args.dispatch_function(args) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/cli/serve.py", line 122, in cmd -(APIServer pid=1) uvloop.run(run_server(args)) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 96, in run -(APIServer pid=1) return __asyncio.run( -(APIServer pid=1) ^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run -(APIServer pid=1) return runner.run(main) -(APIServer pid=1) ^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run -(APIServer pid=1) return self._loop.run_until_complete(task) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 48, in wrapper -(APIServer pid=1) return await main 
-(APIServer pid=1) ^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 670, in run_server -(APIServer pid=1) await run_server_worker(listen_address, sock, args, **uvicorn_kwargs) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 684, in run_server_worker -(APIServer pid=1) async with build_async_engine_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 100, in build_async_engine_client -(APIServer pid=1) async with build_async_engine_client_from_engine_args( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ -(APIServer pid=1) return await anext(self.gen) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 136, in build_async_engine_client_from_engine_args -(APIServer pid=1) async_llm = AsyncLLM.from_vllm_config( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 225, in from_vllm_config -(APIServer pid=1) return cls( -(APIServer pid=1) ^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/async_llm.py", line 154, in __init__ -(APIServer pid=1) self.engine_core = EngineCoreClient.make_async_mp_client( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(APIServer pid=1) return func(*args, **kwargs) -(APIServer pid=1) 
^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 130, in make_async_mp_client -(APIServer pid=1) return AsyncMPClient(*client_args) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/tracing/otel.py", line 178, in sync_wrapper -(APIServer pid=1) return func(*args, **kwargs) -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 887, in __init__ -(APIServer pid=1) super().__init__( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core_client.py", line 535, in __init__ -(APIServer pid=1) with launch_core_engines( -(APIServer pid=1) ^^^^^^^^^^^^^^^^^^^^ -(APIServer pid=1) File "/usr/lib/python3.12/contextlib.py", line 144, in __exit__ -(APIServer pid=1) next(self.gen) -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 998, in launch_core_engines -(APIServer pid=1) wait_for_engine_startup( -(APIServer pid=1) File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/utils.py", line 1057, in wait_for_engine_startup -(APIServer pid=1) raise RuntimeError( -(APIServer pid=1) RuntimeError: Engine core initialization failed. See root cause above. 
Failed core proc(s): {} -/usr/lib/python3.12/multiprocessing/resource_tracker.py:279: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown - warnings.warn('resource_tracker: There appear to be %d ' diff --git a/accuracy/results/v0.19.0/logs/qwen-qwen3-30b-a3b--h100-80gb--tp1pp1dp1--8192.log b/accuracy/results/v0.19.0/logs/qwen-qwen3-30b-a3b--h100-80gb--tp1pp1dp1--8192.log deleted file mode 100644 index 98b73b84..00000000 --- a/accuracy/results/v0.19.0/logs/qwen-qwen3-30b-a3b--h100-80gb--tp1pp1dp1--8192.log +++ /dev/null @@ -1,774 +0,0 @@ -DEBUG 04-22 01:06:00 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:06:00 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:06:00 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:06:00 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:06:00 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:06:00 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:06:00 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:06:00 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:06:00 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:06:00 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:06:00 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:06:00 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:06:04 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-22 01:06:06 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 01:06:06 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:06:06 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:06:06 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:06:06 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 01:06:06 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:06:06 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:06:06 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:06:06 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen3-30B-A3B -(APIServer pid=1) INFO 04-22 01:06:06 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:06:06 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:06:06 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen3-30B-A3B', 'model': 'Qwen/Qwen3-30B-A3B', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:06:06 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:06:07 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen3_moe.Qwen3MoeForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:06:07 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0016402 secs -(APIServer pid=1) INFO 04-22 01:06:07 [config/model.py:549] Resolved architecture: Qwen3MoeForCausalLM -(APIServer pid=1) INFO 04-22 01:06:07 [config/model.py:1678] Using max model len 8192 -(APIServer 
pid=1) DEBUG 04-22 01:06:07 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:06:07 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:06:07 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:06:07 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:06:07 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 01:06:07 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:06:07 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 01:06:08 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:06:08 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:06:09 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:06:09 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:06:13 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:06:13 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:06:13 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:06:13 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:06:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:06:13 [platforms/__init__.py:113] Checking if ROCm platform is available. 
-DEBUG 04-22 01:06:13 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:06:13 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:06:13 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:06:13 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:06:13 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:06:13 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:06:18 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=243) DEBUG 04-22 01:06:19 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:06:19 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=243) DEBUG 04-22 01:06:19 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/3e2c30c9-94dc-4d97-b759-8f269957db68'], outputs=['ipc:///tmp/c9416175-aef4-4b66-9761-7180c0c8b868'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 01:06:19 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 01:06:19 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 01:06:19 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 01:06:19 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 01:06:19 [plugins/__init__.py:49] All plugins in this group will be 
loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(EngineCore pid=243) INFO 04-22 01:06:19 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen3-30B-A3B', speculative_config=None, tokenizer='Qwen/Qwen3-30B-A3B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-30B-A3B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) DEBUG 04-22 01:06:20 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.86:46973 backend=nccl -(EngineCore pid=243) INFO 04-22 01:06:20 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.131.7.86:46973 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) DEBUG 04-22 01:06:20 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) INFO 04-22 01:06:20 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A -(EngineCore pid=243) DEBUG 04-22 01:06:20 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776819980.5765347, auto_measure=True -(EngineCore pid=243) DEBUG 04-22 01:06:20 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=243) DEBUG 04-22 01:06:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:06:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:06:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:06:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=243) DEBUG 04-22 01:06:20 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. 
Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(EngineCore pid=243) DEBUG 04-22 01:06:20 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=243) DEBUG 04-22 01:06:20 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=243) DEBUG 04-22 01:06:20 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) INFO 04-22 01:06:20 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen3-30B-A3B... -(EngineCore pid=243) DEBUG 04-22 01:06:21 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=243) INFO 04-22 01:06:21 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=243) INFO 04-22 01:06:21 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=243) INFO 04-22 01:06:21 [model_executor/.../oracle/unquantized.py:186] Using TRITON backend for Unquantized MoE -(EngineCore pid=243) DEBUG 04-22 01:06:21 [model_executor/.../runner/default_moe_runner.py:240] Enabled separate cuda stream for MoE shared_experts -(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
-(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=243) DEBUG 04-22 01:06:21 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=243) DEBUG 04-22 01:06:21 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=243) DEBUG 04-22 01:06:21 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 193, 'fused_moe': 48, 'unquantized_fused_moe': 48, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=243) DEBUG 04-22 01:06:21 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=243) DEBUG 04-22 01:06:21 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00013-of-00016.safetensors', 'model-00005-of-00016.safetensors', 'model-00004-of-00016.safetensors', 'model-00007-of-00016.safetensors', 'model-00006-of-00016.safetensors', 'model-00016-of-00016.safetensors', 'model-00009-of-00016.safetensors', 'model-00003-of-00016.safetensors', 'model-00002-of-00016.safetensors', 'model-00015-of-00016.safetensors', 'model-00010-of-00016.safetensors', 'model-00011-of-00016.safetensors', 'model-00001-of-00016.safetensors', 'model-00012-of-00016.safetensors', 'model-00008-of-00016.safetensors', 'model-00014-of-00016.safetensors']] -(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/16 [00:00 -(APIServer pid=1) DEBUG 04-22 01:07:19 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/forward_context.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/config.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/fused_moe_method_base.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/layer.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/runner/default_moe_runner.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/shared_fused_moe.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3_moe.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=243) INFO 04-22 01:07:24 [compilation/backends.py:1051] Using cache directory: 
/data/.cache/vllm/torch_compile_cache/2fc27d8a46/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=496d7d5de7 comp=e546579c48 code=691aa59361ebbac2850a48ed2c9a2c8014c83c0f1cd12fbe5f19c3ebb373a13e dir=/data/.cache/vllm/torch_compile_cache/2fc27d8a46/rank_0_0/backbone -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 
'auto', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, 
-(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=243) DEBUG 04-22 01:07:24 
[compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=243) DEBUG 04-22 01:07:24 
[compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 01:07:24 
[compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
-(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/backends.py:1074] Vllm config hash: 496d7d5de7 -(EngineCore pid=243) INFO 04-22 01:07:24 [compilation/backends.py:1111] Dynamo bytecode transform time: 6.86 s -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=243) DEBUG 04-22 01:07:24 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=243) DEBUG 04-22 01:07:26 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:07:26 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=243) DEBUG 04-22 01:07:26 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=243) DEBUG 04-22 01:07:26 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:07:26 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(APIServer pid=1) DEBUG 04-22 01:07:29 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) INFO 04-22 01:07:29 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=243) DEBUG 04-22 01:07:29 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/2fc27d8a46/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=243) DEBUG 04-22 01:07:30 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:07:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=243) DEBUG 04-22 01:07:30 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=243) DEBUG 04-22 01:07:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:07:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) DEBUG 04-22 01:07:31 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/2fc27d8a46/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=243) DEBUG 04-22 01:07:32 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:07:32 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(EngineCore pid=243) DEBUG 04-22 01:07:32 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms -(EngineCore pid=243) DEBUG 04-22 01:07:32 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:07:32 
[compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 01:07:32 [compilation/backends.py:377] Store the 48-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_48', '/data/.cache/vllm/torch_compile_cache/2fc27d8a46/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_48') -(EngineCore pid=243) INFO 04-22 01:07:32 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 7.91 s -(EngineCore pid=243) DEBUG 04-22 01:07:33 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/2fc27d8a46/rank_0_0/backbone/computation_graph.py -(EngineCore pid=243) INFO 04-22 01:07:34 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/aa66eecb906a0de896d6e1e95b50437f56fffa80eec8e3863e8ea6c4af7925d3/rank_0_0/model -(EngineCore pid=243) INFO 04-22 01:07:34 [compilation/monitor.py:48] torch.compile took 17.11 s in total -(EngineCore pid=243) WARNING 04-22 01:07:35 [model_executor/.../fused_moe/fused_moe.py:1090] Using default MoE config. Performance might be sub-optimal! Config file not found at /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/fused_moe/configs/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json -(EngineCore pid=243) INFO 04-22 01:07:35 [compilation/monitor.py:76] Initial profiling/warmup run took 1.08 s -(APIServer pid=1) DEBUG 04-22 01:07:39 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) INFO 04-22 01:07:41 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=243) DEBUG 04-22 01:07:41 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=243) INFO 04-22 01:07:41 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=243) DEBUG 04-22 01:07:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:07:42 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:07:42 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 108.00 MiB first-capture + (51-1) × 10.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:07:42 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
-(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:07:42 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:07:42 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 266.00 MiB first-capture + (51-1) × 10.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 01:07:43 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=243) INFO 04-22 01:07:43 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.24 GiB total -(EngineCore pid=243) DEBUG 04-22 01:07:43 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=243) DEBUG 04-22 01:07:43 [v1/worker/gpu_worker.py:430] Free memory after profiling: 20.74 GiB (total), 17.29 GiB (within requested) -(EngineCore pid=243) DEBUG 04-22 01:07:43 [v1/worker/gpu_worker.py:435] Memory profiling takes 25.66 seconds. Total non KV cache memory: 59.82GiB; torch peak memory increase: 2.68GiB; non-torch forward increase memory: 0.27GiB; weights memory: 56.88GiB. -(EngineCore pid=243) INFO 04-22 01:07:43 [v1/worker/gpu_worker.py:436] Available KV cache memory: 15.41 GiB -(EngineCore pid=243) INFO 04-22 01:07:43 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9656 to maintain the same effective KV cache size. 
-(EngineCore pid=243) INFO 04-22 01:07:43 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 168,288 tokens -(EngineCore pid=243) INFO 04-22 01:07:43 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 20.54x -(EngineCore pid=243) 2026-04-22 01:07:43,491 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=243) DEBUG 04-22 01:07:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) 2026-04-22 01:07:43,521 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:04:39 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:04:39 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:04:39 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:04:39 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:04:39 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:04:39 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model Qwen/Qwen3-8B -(APIServer pid=1) INFO 04-22 01:04:39 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:04:39 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:04:39 [entrypoints/utils.py:233] non-default args: {'model_tag': 'Qwen/Qwen3-8B', 'model': 'Qwen/Qwen3-8B', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:04:39 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:04:39 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen3.Qwen3ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:04:39 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0006517 secs -(APIServer pid=1) INFO 04-22 01:04:39 [config/model.py:549] Resolved architecture: Qwen3ForCausalLM -(APIServer pid=1) INFO 04-22 01:04:39 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 01:04:39 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:04:39 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:04:39 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:04:39 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:04:39 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) INFO 04-22 01:04:39 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:04:39 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 01:04:39 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:04:39 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:04:40 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:04:40 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:04:44 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:04:44 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:04:44 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:04:44 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:04:44 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:04:44 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:04:44 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:04:44 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:04:44 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:04:44 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:04:44 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:04:44 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:04:49 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-(APIServer pid=1) DEBUG 04-22 01:04:50 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. -(EngineCore pid=243) DEBUG 04-22 01:04:50 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:04:50 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=243) DEBUG 04-22 01:04:50 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/db739ab4-58eb-4aeb-b654-24c2702f39ba'], outputs=['ipc:///tmp/c68fa860-9ede-4935-81bc-ed8c6b4d4677'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 01:04:50 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 01:04:50 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 01:04:50 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 01:04:50 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 01:04:50 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=243) INFO 04-22 01:04:50 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='Qwen/Qwen3-8B', speculative_config=None, tokenizer='Qwen/Qwen3-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=Qwen/Qwen3-8B, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 
'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) DEBUG 04-22 01:04:51 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.35:33327 backend=nccl -(EngineCore pid=243) INFO 04-22 01:04:51 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.7.35:33327 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) DEBUG 04-22 01:04:51 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) INFO 04-22 01:04:51 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=243) DEBUG 04-22 01:04:51 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776819891.829098, auto_measure=True -(EngineCore pid=243) DEBUG 04-22 01:04:51 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=243) DEBUG 04-22 01:04:51 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:04:51 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:04:51 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 01:04:51 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=243) DEBUG 04-22 01:04:51 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=243) DEBUG 04-22 01:04:51 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=243) DEBUG 04-22 01:04:52 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=243) INFO 04-22 01:04:52 [v1/worker/gpu_model_runner.py:4735] Starting to load model Qwen/Qwen3-8B... -(EngineCore pid=243) DEBUG 04-22 01:04:52 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=243) INFO 04-22 01:04:52 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=243) INFO 04-22 01:04:52 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=243) DEBUG 04-22 01:04:52 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=243) DEBUG 04-22 01:04:52 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=243) DEBUG 04-22 01:04:52 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=243) DEBUG 04-22 01:04:52 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 145, 'silu_and_mul': 36, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=243) DEBUG 04-22 01:04:52 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=243) DEBUG 04-22 01:04:53 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00005-of-00005.safetensors', 'model-00003-of-00005.safetensors', 'model-00004-of-00005.safetensors', 'model-00002-of-00005.safetensors', 'model-00001-of-00005.safetensors']] -(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/5 [00:00 -(APIServer pid=1) DEBUG 04-22 01:05:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/utils.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen3.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/parameter.py -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=243) INFO 04-22 01:05:13 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/709aca0718/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=500ddfe46b comp=e546579c48 code=318216e61e692fc38536a6acea806e2e636550278ab73e70427008e98d48a0f3 dir=/data/.cache/vllm/torch_compile_cache/709aca0718/rank_0_0/backbone -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 
'CUDA_HOME': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 
'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 
'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 
'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 
'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 
'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_PLUGINS': 
None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=243) DEBUG 04-22 
01:05:13 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 
[compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': 
True, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/backends.py:1074] Vllm config hash: 500ddfe46b -(EngineCore pid=243) INFO 04-22 01:05:13 [compilation/backends.py:1111] Dynamo bytecode transform time: 5.60 s -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=243) DEBUG 04-22 01:05:13 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=243) DEBUG 04-22 01:05:14 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:05:14 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=243) DEBUG 04-22 01:05:14 
[compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms -(EngineCore pid=243) DEBUG 04-22 01:05:14 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:05:14 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) INFO 04-22 01:05:18 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=243) DEBUG 04-22 01:05:18 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/709aca0718/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=243) DEBUG 04-22 01:05:18 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:05:18 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=243) DEBUG 04-22 01:05:18 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.3 ms -(EngineCore pid=243) DEBUG 04-22 01:05:18 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:05:18 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(APIServer pid=1) DEBUG 04-22 01:05:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) DEBUG 04-22 01:05:21 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/709aca0718/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=243) DEBUG 04-22 01:05:22 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 01:05:22 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(EngineCore pid=243) DEBUG 04-22 01:05:22 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=243) DEBUG 04-22 01:05:22 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 01:05:22 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=243) DEBUG 04-22 01:05:22 [compilation/backends.py:377] Store the 36-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_36', '/data/.cache/vllm/torch_compile_cache/709aca0718/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_36') -(EngineCore pid=243) INFO 04-22 01:05:22 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 8.58 s -(EngineCore pid=243) DEBUG 04-22 01:05:22 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/709aca0718/rank_0_0/backbone/computation_graph.py -(EngineCore pid=243) INFO 04-22 01:05:23 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/1b1f0768e3b39bb5337d62f50b4457b3465da85a5edcee9089ee51445148b822/rank_0_0/model -(EngineCore pid=243) INFO 04-22 01:05:23 [compilation/monitor.py:48] torch.compile took 15.63 s in total -(EngineCore pid=243) INFO 04-22 01:05:24 
[compilation/monitor.py:76] Initial profiling/warmup run took 0.56 s -(EngineCore pid=243) INFO 04-22 01:05:29 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=243) DEBUG 04-22 01:05:29 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=243) INFO 04-22 01:05:29 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:05:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:05:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 130.00 MiB first-capture + (51-1) × 8.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:05:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, 
num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 01:05:30 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(APIServer pid=1) DEBUG 04-22 01:05:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=243) DEBUG 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=243) INFO 04-22 01:05:30 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total -(EngineCore pid=243) DEBUG 04-22 01:05:31 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=243) DEBUG 04-22 01:05:31 [v1/worker/gpu_worker.py:430] Free memory after profiling: 61.84 GiB (total), 58.39 GiB (within requested) -(EngineCore pid=243) DEBUG 04-22 01:05:31 [v1/worker/gpu_worker.py:435] Memory profiling takes 23.00 seconds. Total non KV cache memory: 17.73GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.25GiB; weights memory: 15.27GiB. -(EngineCore pid=243) INFO 04-22 01:05:31 [v1/worker/gpu_worker.py:436] Available KV cache memory: 57.5 GiB -(EngineCore pid=243) INFO 04-22 01:05:31 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. 
To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. -(EngineCore pid=243) INFO 04-22 01:05:31 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 418,688 tokens -(EngineCore pid=243) INFO 04-22 01:05:31 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 51.11x -(EngineCore pid=243) 2026-04-22 01:05:31,167 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=243) DEBUG 04-22 01:05:31 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) 2026-04-22 01:05:31,180 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:55:22 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:55:22 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 01:55:22 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:55:22 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 01:55:22 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 01:55:22 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 -(APIServer pid=1) INFO 04-22 01:55:22 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 01:55:22 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 01:55:22 [entrypoints/utils.py:233] non-default args: {'model_tag': 'redhatai/Llama-3.3-70B-Instruct-quantized.w8a8', 'model': 'redhatai/Llama-3.3-70B-Instruct-quantized.w8a8', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 01:55:22 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 01:55:23 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 01:55:23 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003789 secs -(APIServer pid=1) INFO 04-22 01:55:23 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) INFO 04-22 01:55:23 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 01:55:23 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 01:55:23 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 01:55:23 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 01:55:23 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 01:55:23 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 01:55:23 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 01:55:23 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 01:55:23 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) INFO 04-22 01:55:24 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 01:55:24 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 01:55:24 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:55:25 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 01:55:25 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 01:55:28 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:55:28 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:55:28 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:55:28 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:55:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:55:28 [platforms/__init__.py:113] Checking if ROCm platform is available. 
-DEBUG 04-22 01:55:28 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:55:28 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:55:28 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:55:28 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:55:28 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:55:28 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:55:33 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=244) DEBUG 04-22 01:55:35 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 01:55:35 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=244) DEBUG 04-22 01:55:35 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/1701c665-d91f-4876-9ebb-bf4915d36362'], outputs=['ipc:///tmp/7ea2c92f-1e51-48db-9181-6033967a2814'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=244) DEBUG 04-22 01:55:35 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=244) DEBUG 04-22 01:55:35 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=244) DEBUG 04-22 01:55:35 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=244) DEBUG 04-22 01:55:35 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=244) DEBUG 04-22 01:55:35 [plugins/__init__.py:49] All plugins in this group will be 
loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(EngineCore pid=244) INFO 04-22 01:55:35 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='redhatai/Llama-3.3-70B-Instruct-quantized.w8a8', speculative_config=None, tokenizer='redhatai/Llama-3.3-70B-Instruct-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=redhatai/Llama-3.3-70B-Instruct-quantized.w8a8, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 
'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [4096, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=244) WARNING 04-22 01:55:35 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=244) INFO 04-22 01:55:35 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.131.3.126 (local), world_size=2, local_world_size=2 -(EngineCore pid=244) DEBUG 04-22 01:55:35 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/ea3f9a82-825d-40ce-bbcc-2ddc6e815723 -(EngineCore pid=244) DEBUG 04-22 01:55:35 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_8477034d'), local_subscribe_addr='ipc:///tmp/ea3f9a82-825d-40ce-bbcc-2ddc6e815723', local_notify_addr='ipc:///tmp/ce075078-4372-434e-85ca-c7a0d0052cd6', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 01:55:38 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:55:38 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:55:38 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:55:38 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:55:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:55:38 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:55:38 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:55:38 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:55:38 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:55:38 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:55:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:55:38 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-22 01:55:38 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 01:55:38 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 01:55:38 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 01:55:38 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:55:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:55:38 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 01:55:38 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 01:55:38 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 01:55:38 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 01:55:38 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 01:55:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 01:55:38 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 01:55:43 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:55:43 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 01:55:45 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:55:45 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:55:45 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:55:45 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-DEBUG 04-22 01:55:45 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 01:55:45 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 01:55:45 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 01:55:45 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) DEBUG 04-22 01:55:45 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker pid=443) DEBUG 04-22 01:55:45 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:58613 backend=nccl -(Worker pid=443) INFO 04-22 01:55:45 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:58613 backend=nccl -(Worker pid=444) DEBUG 04-22 01:55:45 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:58613 backend=nccl -(Worker pid=444) INFO 04-22 01:55:45 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:58613 backend=nccl -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=444) DEBUG 04-22 01:55:45 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=443) DEBUG 04-22 01:55:45 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. 
Expected number of connected peer ranks is : 1 -(Worker pid=443) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=443) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=444) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=443) DEBUG 04-22 01:55:46 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=443) INFO 04-22 01:55:46 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=443) DEBUG 04-22 01:55:46 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=444) DEBUG 04-22 01:55:46 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. 
-(Worker pid=443) DEBUG 04-22 01:55:46 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/08771257-3d22-4234-b496-55ad1ccdf616 -(Worker pid=443) DEBUG 04-22 01:55:46 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_6004ee6f'), local_subscribe_addr='ipc:///tmp/08771257-3d22-4234-b496-55ad1ccdf616', local_notify_addr='ipc:///tmp/acdfd856-dfd1-46a2-8bd0-c53fd5031672', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=444) DEBUG 04-22 01:55:46 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/08771257-3d22-4234-b496-55ad1ccdf616 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -(Worker pid=443) INFO 04-22 01:55:46 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(Worker pid=443) DEBUG 04-22 01:55:47 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776822947.147593, auto_measure=True -(Worker pid=443) DEBUG 04-22 01:55:47 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=444) DEBUG 04-22 01:55:47 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776822947.1658504, auto_measure=True -(Worker pid=444) DEBUG 04-22 01:55:47 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=443) DEBUG 04-22 01:55:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 01:55:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=443) DEBUG 04-22 01:55:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:55:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=444) DEBUG 04-22 01:55:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=444) DEBUG 04-22 01:55:47 
[compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=443) DEBUG 04-22 01:55:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=443) DEBUG 04-22 01:55:47 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=443) DEBUG 04-22 01:55:47 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=444) DEBUG 04-22 01:55:47 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=444) DEBUG 04-22 01:55:47 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=443) DEBUG 04-22 01:55:47 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_TP0 pid=443) INFO 04-22 01:55:47 [v1/worker/gpu_model_runner.py:4735] Starting to load model redhatai/Llama-3.3-70B-Instruct-quantized.w8a8... 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.self_attn.qkv_proj -(Worker_TP0 pid=443) INFO 04-22 01:55:47 [model_executor/.../linear/__init__.py:291] Selected CutlassInt8ScaledMMLinearKernel for CompressedTensorsW8A8Int8 -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_TP0 pid=443) INFO 04-22 01:55:47 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(Worker_TP0 pid=443) INFO 04-22 01:55:47 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.self_attn.qkv_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 
01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.self_attn.o_proj 
-(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 
for model.layers.4.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Int8 for model.layers.5.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.self_attn.o_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
model.layers.9.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] 
Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.15.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.self_attn.qkv_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.mlp.gate_up_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.19.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.mlp.gate_up_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.23.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.27.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.31.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.32.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.32.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.32.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.32.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.32.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.32.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.32.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.32.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.33.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.33.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.33.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.33.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.33.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.33.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.33.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.33.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.34.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.34.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.34.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.34.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.34.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.34.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.34.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.34.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.35.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.35.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.35.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.35.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.35.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.35.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.35.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.35.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.36.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.36.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.36.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.36.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.36.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.36.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.36.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.36.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.37.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.37.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.37.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.37.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.37.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.37.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.37.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.37.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.38.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.38.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.38.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.38.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.38.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.38.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.38.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.38.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.39.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.39.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.39.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.39.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.39.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.39.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.39.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.39.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.40.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.40.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.40.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.40.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.40.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.40.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.40.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.40.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.41.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.41.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.41.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.41.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.41.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.41.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.41.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.41.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.42.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.42.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.42.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.42.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.42.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.42.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.42.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.42.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.43.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.43.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.43.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.43.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.43.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.43.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.43.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.43.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.44.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.44.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.44.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.44.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.44.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.44.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.44.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.44.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.45.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.45.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.45.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.45.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.45.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.45.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.45.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.45.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.46.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.46.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.46.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.46.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.46.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.46.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.46.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.46.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.47.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.47.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.47.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.47.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.47.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.47.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.47.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.47.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.48.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.48.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.48.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.48.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.48.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.48.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.48.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.48.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.49.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.49.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.49.mlp.gate_up_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.49.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.49.self_attn.qkv_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.49.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.49.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.49.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.50.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.50.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.50.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.50.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.50.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.50.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.50.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.50.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.51.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.51.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.51.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.51.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.51.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.51.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.51.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.51.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.52.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.52.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.52.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.52.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.52.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.52.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.52.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.52.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.53.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.53.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.53.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.53.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.53.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.53.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.53.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.53.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.54.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.54.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.54.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.54.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.54.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.54.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.54.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.54.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.55.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.55.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.55.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.55.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.55.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.55.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.55.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.55.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.56.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.56.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.56.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.56.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.56.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.56.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.56.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.56.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.57.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.57.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.57.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.57.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.57.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.57.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.57.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.57.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.58.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.58.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.58.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.58.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.58.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.58.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.58.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.58.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.59.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.59.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.59.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.59.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.59.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.59.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.59.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.59.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.60.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.60.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.60.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.60.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.60.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.60.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.60.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.60.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.61.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.61.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.61.mlp.gate_up_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.61.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.61.self_attn.qkv_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.61.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.61.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.61.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.62.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.62.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.62.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.62.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.62.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.62.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.62.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.62.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.63.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.63.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.63.mlp.gate_up_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.63.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.63.self_attn.qkv_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.63.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.63.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.63.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.64.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.64.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.64.mlp.gate_up_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.64.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.64.self_attn.qkv_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.64.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.64.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.64.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.65.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.65.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.65.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.65.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.65.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.65.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.65.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.65.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.66.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.66.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.66.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.66.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.66.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.66.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.66.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.66.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.67.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.67.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.67.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.67.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.67.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.67.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.67.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.67.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.68.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:47 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.68.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.68.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.68.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.68.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.68.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.68.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.68.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.69.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.69.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.69.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.69.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.69.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.69.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.69.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.69.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.70.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.70.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.70.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.70.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.70.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.70.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.70.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.70.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.71.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.71.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.71.mlp.gate_up_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.71.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.71.self_attn.qkv_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.71.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.71.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.71.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.72.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.72.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.72.mlp.gate_up_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.72.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.72.self_attn.qkv_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.72.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.72.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.72.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.73.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.73.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.73.mlp.gate_up_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.73.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.73.self_attn.qkv_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.73.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.73.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.73.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.74.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.74.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.74.mlp.gate_up_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.74.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.74.self_attn.qkv_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.74.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.74.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.74.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.75.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.75.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.75.mlp.gate_up_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.75.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.75.self_attn.qkv_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.75.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.75.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.75.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.76.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.76.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.76.mlp.gate_up_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.76.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.76.self_attn.qkv_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.76.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.76.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.76.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.77.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.77.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.77.mlp.gate_up_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.77.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.77.self_attn.qkv_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.77.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.77.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.77.mlp.down_proj 
-(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.78.self_attn.qkv_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.78.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.78.mlp.gate_up_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.78.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.78.self_attn.qkv_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.78.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.78.mlp.gate_up_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.79.self_attn.qkv_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.78.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.79.self_attn.o_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for model.layers.79.mlp.gate_up_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.79.mlp.down_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.79.self_attn.qkv_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.79.self_attn.o_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.79.mlp.gate_up_proj -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.79.mlp.down_proj -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 161, 'silu_and_mul': 80, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP0 pid=443) DEBUG 04-22 01:55:48 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00007-of-00015.safetensors', 'model-00003-of-00015.safetensors', 'model-00001-of-00015.safetensors', 'model-00014-of-00015.safetensors', 'model-00009-of-00015.safetensors', 'model-00015-of-00015.safetensors', 'model-00010-of-00015.safetensors', 'model-00006-of-00015.safetensors', 'model-00002-of-00015.safetensors', 'model-00012-of-00015.safetensors', 'model-00008-of-00015.safetensors', 'model-00013-of-00015.safetensors', 'model-00005-of-00015.safetensors', 'model-00004-of-00015.safetensors', 'model-00011-of-00015.safetensors']] -(Worker_TP1 pid=444) DEBUG 04-22 01:55:48 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00005-of-00015.safetensors', 'model-00001-of-00015.safetensors', 'model-00008-of-00015.safetensors', 'model-00009-of-00015.safetensors', 'model-00011-of-00015.safetensors', 'model-00012-of-00015.safetensors', 'model-00007-of-00015.safetensors', 'model-00002-of-00015.safetensors', 'model-00015-of-00015.safetensors', 'model-00010-of-00015.safetensors', 'model-00006-of-00015.safetensors', 'model-00014-of-00015.safetensors', 'model-00013-of-00015.safetensors', 'model-00003-of-00015.safetensors', 'model-00004-of-00015.safetensors']] -(Worker_TP0 pid=443) Loading safetensors checkpoint shards: 0% Completed | 0/15 [00:00 -(Worker_TP1 pid=444) DEBUG 04-22 01:56:08 [compilation/decorators.py:528] Start compiling function 
-(EngineCore pid=244) DEBUG 04-22 01:56:09 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) DEBUG 04-22 01:56:15 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP0 pid=443) INFO 04-22 01:56:21 [compilation/backends.py:1051] Using cache directory: 
/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone for vLLM's torch.compile -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=ad833cdb21 comp=e546579c48 code=00c436b0deda272393bbd56b49f2a57f076817aa66601ed815250cff678fbbf0 dir=/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 
'auto', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, 
-(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
-(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] Vllm config hash: ad833cdb21 -(Worker_TP0 pid=443) INFO 04-22 01:56:21 [compilation/backends.py:1111] Dynamo bytecode transform time: 12.42 s -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 4096 -(Worker_TP0 pid=443) INFO 04-22 01:56:21 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP1 pid=444) DEBUG 04-22 
01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=ad833cdb21 comp=e546579c48 code=00c436b0deda272393bbd56b49f2a57f076817aa66601ed815250cff678fbbf0 dir=/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_1_0/backbone -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP1 pid=444) DEBUG 04-22 
01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 
'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 
'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP1 pid=444) DEBUG 04-22 
01:56:21 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP1 pid=444) DEBUG 04-22 
01:56:21 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 
[compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP1 
pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 
'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [compilation/backends.py:1074] Vllm config hash: ad833cdb21 -(Worker_TP0 pid=443) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. You can specify `device_id` in `init_process_group` to mute this warning. -(Worker_TP0 pid=443) return func(*args, **kwargs) -(Worker_TP1 pid=444) DEBUG 04-22 01:56:21 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=4096, hidden_dim=8192, dtype=torch.bfloat16 -(Worker_TP0 pid=443) DEBUG 04-22 01:56:21 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=4096, hidden_dim=8192, dtype=torch.bfloat16 -(Worker_TP0 pid=443) INFO 04-22 01:56:21 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm -(Worker_TP0 pid=443) DEBUG 04-22 01:56:23 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 4096), (4097, 8192)] -(Worker_TP0 pid=443) DEBUG 04-22 01:56:23 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(Worker_TP0 pid=443) DEBUG 04-22 01:56:24 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=443) DEBUG 04-22 01:56:24 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:24 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP0 pid=443) DEBUG 04-22 01:56:24 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 23.0 ms -(Worker_TP0 pid=443) DEBUG 04-22 
01:56:24 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:24 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP0 pid=443) DEBUG 04-22 01:56:24 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:24 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=444) DEBUG 04-22 01:56:24 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:24 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 1 patterns -(Worker_TP1 pid=444) DEBUG 04-22 01:56:24 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 23.5 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:24 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:24 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 1 nodes, removed 3 nodes -(Worker_TP1 pid=444) DEBUG 04-22 01:56:24 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(APIServer pid=1) DEBUG 04-22 01:56:25 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP0 pid=443) INFO 04-22 01:56:26 [compilation/backends.py:372] Cache the graph of compile range (1, 4096) for later use -(Worker_TP0 pid=443) DEBUG 04-22 01:56:26 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_0', '/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_0') -(Worker_TP0 pid=443) DEBUG 04-22 01:56:26 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=443) DEBUG 04-22 01:56:26 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:26 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP0 pid=443) DEBUG 04-22 01:56:26 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.7 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:26 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=443) DEBUG 04-22 01:56:26 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:27 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=444) DEBUG 04-22 01:56:27 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:27 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP1 pid=444) DEBUG 04-22 01:56:27 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:27 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=444) DEBUG 04-22 01:56:27 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed 
in 0.4 ms -(Worker_TP0 pid=443) INFO 04-22 01:56:27 [compilation/backends.py:372] Cache the graph of compile range (4097, 8192) for later use -(Worker_TP0 pid=443) DEBUG 04-22 01:56:27 [compilation/backends.py:377] Store the 0-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_0') -(Worker_TP0 pid=443) DEBUG 04-22 01:56:28 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=443) DEBUG 04-22 01:56:28 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:28 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=443) DEBUG 04-22 01:56:28 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.7 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:28 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:28 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=443) DEBUG 04-22 01:56:28 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:28 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=444) DEBUG 04-22 01:56:28 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:28 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=444) DEBUG 04-22 01:56:28 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 37.1 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:28 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms 
-(Worker_TP1 pid=444) DEBUG 04-22 01:56:28 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=444) DEBUG 04-22 01:56:28 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:29 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_1', '/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_1') -(Worker_TP0 pid=443) DEBUG 04-22 01:56:29 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=443) DEBUG 04-22 01:56:29 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:29 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP0 pid=443) DEBUG 04-22 01:56:29 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:29 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=443) DEBUG 04-22 01:56:29 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:29 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=444) DEBUG 04-22 01:56:29 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.9 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:29 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP1 pid=444) DEBUG 04-22 01:56:29 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:29 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 
nodes, removed 0 nodes -(Worker_TP1 pid=444) DEBUG 04-22 01:56:29 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:31 [compilation/backends.py:377] Store the 1-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_1') -(APIServer pid=1) DEBUG 04-22 01:56:35 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(Worker_TP0 pid=443) DEBUG 04-22 01:56:37 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=443) DEBUG 04-22 01:56:37 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:37 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=443) DEBUG 04-22 01:56:37 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 38.8 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:37 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:37 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP0 pid=443) DEBUG 04-22 01:56:37 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass 
completed in 38.1 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:38 [compilation/backends.py:377] Store the 80-th graph for compile range(1, 4096) from inductor_standalone via handle ('artifact_compile_range_1_4096_subgraph_80', '/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone/artifact_compile_range_1_4096_subgraph_80') -(Worker_TP0 pid=443) INFO 04-22 01:56:38 [compilation/backends.py:390] Compiling a graph for compile range (1, 4096) takes 11.64 s -(Worker_TP0 pid=443) DEBUG 04-22 01:56:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=443) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:38 [compilation/passes/pass_manager.py:100] Skipping with compile range (4097, 8192) -(Worker_TP0 pid=443) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=443) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/passes/pass_manager.py:100] 
Skipping with compile range (4097, 8192) -(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=444) DEBUG 04-22 01:56:38 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=443) DEBUG 04-22 01:56:39 [compilation/backends.py:377] Store the 80-th graph for compile range(4097, 8192) from inductor_standalone via handle ('artifact_compile_range_4097_8192_subgraph_80', '/data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone/artifact_compile_range_4097_8192_subgraph_80') -(Worker_TP0 pid=443) INFO 04-22 01:56:39 [compilation/backends.py:390] Compiling a graph for compile range (4097, 8192) takes 12.71 s -(Worker_TP0 pid=443) DEBUG 04-22 01:56:39 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/71aef8fca1/rank_0_0/backbone/computation_graph.py -(Worker_TP0 pid=443) INFO 04-22 01:56:42 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/eac9c02a0de983743bee948426f8d57256d1535569de6e48b0a78e1093f03a5a/rank_0_0/model -(Worker_TP0 pid=443) INFO 04-22 01:56:42 [compilation/monitor.py:48] torch.compile took 33.81 s in total -(Worker_TP0 pid=443) INFO 04-22 01:56:44 [compilation/monitor.py:76] Initial profiling/warmup run took 1.92 s -(APIServer pid=1) DEBUG 04-22 01:56:45 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP0 pid=443) INFO 04-22 01:56:50 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP0 pid=443) DEBUG 04-22 01:56:50 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP0 pid=443) INFO 04-22 01:56:50 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP1 pid=444) INFO 04-22 01:56:50 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP1 pid=444) DEBUG 04-22 01:56:50 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP1 pid=444) INFO 04-22 01:56:50 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP0 pid=443) DEBUG 04-22 01:56:50 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:56:50 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:56:50 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:56:50 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=444) DEBUG 04-22 01:56:50 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:56:50 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] 
ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 122.00 MiB first-capture + (51-1) × 20.00 MiB per-graph -(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 122.00 MiB first-capture + (51-1) × 20.00 MiB per-graph -(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
-(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=444) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 268.00 MiB first-capture + (51-1) × 14.00 MiB per-graph -(Worker_TP1 pid=444) INFO 04-22 01:56:51 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP0 pid=443) DEBUG 04-22 01:56:51 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 268.00 MiB first-capture + (51-1) × 14.00 MiB per-graph -(Worker_TP0 pid=443) INFO 04-22 01:56:51 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP0 pid=443) DEBUG 04-22 01:56:52 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP0 pid=443) INFO 04-22 01:56:52 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.92 GiB total -(Worker_TP1 pid=444) DEBUG 04-22 01:56:52 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP1 pid=444) INFO 04-22 01:56:52 [v1/worker/gpu_model_runner.py:5955] Estimated 
CUDA graph memory: 1.92 GiB total -(Worker_TP0 pid=443) DEBUG 04-22 01:56:52 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP0 pid=443) DEBUG 04-22 01:56:52 [v1/worker/gpu_worker.py:430] Free memory after profiling: 41.35 GiB (total), 38.92 GiB (within requested) -(Worker_TP0 pid=443) DEBUG 04-22 01:56:52 [v1/worker/gpu_worker.py:435] Memory profiling takes 43.50 seconds. Total non KV cache memory: 37.95GiB; torch peak memory increase: 1.96GiB; non-torch forward increase memory: 2.11GiB; weights memory: 33.88GiB. -(Worker_TP0 pid=443) INFO 04-22 01:56:52 [v1/worker/gpu_worker.py:436] Available KV cache memory: 37.28 GiB -(Worker_TP0 pid=443) INFO 04-22 01:56:52 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9743 to maintain the same effective KV cache size. -(Worker_TP0 pid=443) DEBUG 04-22 01:56:52 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 01:56:52 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 01:56:52 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=444) DEBUG 04-22 01:56:52 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP1 pid=444) DEBUG 04-22 01:56:52 [v1/worker/gpu_worker.py:430] Free memory after profiling: 41.35 GiB (total), 38.92 GiB (within requested) -(Worker_TP1 pid=444) DEBUG 04-22 01:56:52 [v1/worker/gpu_worker.py:435] Memory profiling takes 43.59 seconds. 
Total non KV cache memory: 37.95GiB; torch peak memory increase: 1.96GiB; non-torch forward increase memory: 2.11GiB; weights memory: 33.88GiB. -(Worker_TP1 pid=444) INFO 04-22 01:56:52 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9743 to maintain the same effective KV cache size. -(Worker_TP1 pid=444) DEBUG 04-22 01:56:52 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 01:56:52 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) INFO 04-22 01:56:52 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 244,304 tokens -(EngineCore pid=244) INFO 04-22 01:56:52 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 29.82x -(Worker_TP1 pid=444) DEBUG 04-22 01:56:52 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=443) DEBUG 04-22 01:56:52 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=444) INFO 04-22 01:56:52 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 -(Worker_TP0 pid=443) INFO 04-22 01:56:52 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 -(Worker_TP1 pid=444) DEBUG 04-22 01:56:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) DEBUG 04-22 01:56:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) 2026-04-22 01:56:52,690 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(Worker_TP1 pid=444) 2026-04-22 01:56:52,690 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP0 pid=443) DEBUG 04-22 01:56:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=444) DEBUG 04-22 01:56:52 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) 2026-04-22 01:56:53,303 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP1 pid=444) 2026-04-22 01:56:53,303 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=244) DEBUG 04-22 01:56:53 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=444) DEBUG 04-22 01:56:53 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=443) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=244) INFO 04-22 01:57:03 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP0 pid=443) DEBUG 04-22 01:57:03 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=444) DEBUG 04-22 01:57:03 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 01:57:03 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=244) DEBUG 04-22 01:57:03 [v1/engine/core.py:1158] EngineCore waiting for work. 
-(EngineCore pid=244) DEBUG 04-22 01:57:03 [v1/engine/core.py:1158] EngineCore waiting for work. -(APIServer pid=1) INFO 04-22 01:57:03 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] -(APIServer pid=1) WARNING 04-22 01:57:04 [config/model.py:1435] Default vLLM sampling parameters have been overridden by the model's `generation_config.json`: `{'temperature': 0.6, 'top_p': 0.9}`. If this is not intended, please relaunch vLLM instance with `--generation-config vllm`. -(APIServer pid=1) DEBUG 04-22 01:57:04 [renderers/base.py:197] Warming up chat template processing... -(APIServer pid=1) DEBUG 04-22 01:57:04 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e82af0-730e730b2a43419e17ae13cc;6e56a674-0198-405a-8e28-0b9e1442f3df) -(APIServer pid=1) DEBUG 04-22 01:57:04 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 01:57:04 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/RedHatAI/Llama-3.3-70B-Instruct-quantized.w8a8/resolve/main/processor_config.json. -(APIServer pid=1) DEBUG 04-22 01:57:04 [transformers_utils/repo_utils.py:243] File or repository not found in hf_hub_download: 404 Client Error. (Request ID: Root=1-69e82af0-02cf5c644987fd933928fe23;bce00550-b3f4-4c54-90ef-7f32f9f19154) -(APIServer pid=1) DEBUG 04-22 01:57:04 [transformers_utils/repo_utils.py:243] -(APIServer pid=1) DEBUG 04-22 01:57:04 [transformers_utils/repo_utils.py:243] Entry Not Found for url: https://huggingface.co/RedHatAI/Llama-3.3-70B-Instruct-quantized.w8a8/resolve/main/preprocessor_config.json. 
-(Worker_TP0 pid=443) DEBUG 04-22 01:57:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=444) DEBUG 04-22 01:57:04 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) INFO 04-22 01:57:05 [renderers/hf.py:314] Detected the chat template content format to be 'string'. You can set `--chat-template-content-format` to override this. -(APIServer pid=1) DEBUG 04-22 01:57:05 [renderers/base.py:203] Chat template warmup completed in 1.355s -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:37] Available routes are: -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /docs, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /redoc, Methods: GET, HEAD -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /load, Methods: GET -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /version, Methods: GET -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /health, Methods: GET -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /ping, Methods: GET -(APIServer pid=1) INFO 04-22 
01:57:05 [entrypoints/launcher.py:46] Route: /ping, Methods: POST -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST -(APIServer pid=1) INFO 04-22 01:57:05 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST -(APIServer pid=1) INFO: Started server process [1] -(APIServer pid=1) INFO: Waiting for application startup. -(APIServer pid=1) INFO: Application startup complete. -(APIServer pid=1) DEBUG 04-22 01:57:11 [v1/engine/async_llm.py:875] Called check_health. 
-(APIServer pid=1) INFO: 10.131.2.2:58456 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/w4a16-redhatai-lla--h100-80gb--tp1pp1dp1--8192-dtf16.log b/accuracy/results/v0.19.0/logs/w4a16-redhatai-lla--h100-80gb--tp1pp1dp1--8192-dtf16.log deleted file mode 100644 index 94e136a0..00000000 --- a/accuracy/results/v0.19.0/logs/w4a16-redhatai-lla--h100-80gb--tp1pp1dp1--8192-dtf16.log +++ /dev/null @@ -1,745 +0,0 @@ -DEBUG 04-22 00:10:51 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:10:51 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:10:51 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:10:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:10:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:10:51 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:10:51 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:10:51 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:10:51 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:10:51 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:10:51 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:10:51 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:10:56 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-22 00:10:58 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 00:10:58 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:10:58 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:10:58 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:10:58 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) INFO 04-22 00:10:58 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:10:58 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:10:58 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:10:58 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16 -(APIServer pid=1) INFO 04-22 00:10:58 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:10:58 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:10:58 [entrypoints/utils.py:233] non-default args: {'model_tag': 'RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16', 'model': 'RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16', 'dtype': 'float16', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:10:58 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:10:58 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:10:58 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0004022 secs -(APIServer pid=1) INFO 04-22 00:10:58 [config/model.py:549] Resolved architecture: 
LlamaForCausalLM -(APIServer pid=1) INFO 04-22 00:10:58 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) INFO 04-22 00:10:59 [model_executor/.../quantization/gptq_marlin.py:229] The model is convertible to gptq_marlin during runtime. Using gptq_marlin kernel. -(APIServer pid=1) DEBUG 04-22 00:10:59 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:10:59 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:10:59 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:10:59 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:10:59 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 00:10:59 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:10:59 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:10:59 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:10:59 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:11:00 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:11:00 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:11:03 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:11:03 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:11:03 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:11:03 [platforms/__init__.py:62] Checking if CUDA platform is available. 
-DEBUG 04-22 00:11:03 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:11:03 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:11:03 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:11:03 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:11:03 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:11:03 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:11:03 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:11:04 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:11:08 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=244) DEBUG 04-22 00:11:10 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:11:10 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=244) DEBUG 04-22 00:11:10 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/429d0cbc-ad35-4f71-b5c3-82da97b23547'], outputs=['ipc:///tmp/1b75cf70-26aa-4bf5-90dd-5602d37f9019'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=244) DEBUG 04-22 00:11:10 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=244) DEBUG 04-22 00:11:10 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=244) DEBUG 04-22 00:11:10 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=244) DEBUG 04-22 00:11:10 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=244) DEBUG 04-22 00:11:10 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=244) INFO 04-22 00:11:10 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16', speculative_config=None, tokenizer='RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=gptq_marlin, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 
'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=244) DEBUG 04-22 00:11:10 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.24:37197 backend=nccl -(EngineCore pid=244) INFO 04-22 00:11:10 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.24:37197 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) DEBUG 04-22 00:11:10 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) INFO 04-22 00:11:10 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=244) DEBUG 04-22 00:11:11 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776816671.2624762, auto_measure=True -(EngineCore pid=244) DEBUG 04-22 00:11:11 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=244) DEBUG 04-22 00:11:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:11:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:11:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:11:11 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=244) DEBUG 04-22 00:11:11 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=244) DEBUG 04-22 00:11:11 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=244) DEBUG 04-22 00:11:11 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=244) INFO 04-22 00:11:11 [v1/worker/gpu_model_runner.py:4735] Starting to load model RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w4a16... -(EngineCore pid=244) INFO 04-22 00:11:11 [model_executor/.../quantization/gptq_marlin.py:376] Using MacheteLinearKernel for GPTQMarlinLinearMethod -(EngineCore pid=244) DEBUG 04-22 00:11:12 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.float16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=244) INFO 04-22 00:11:12 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. -(EngineCore pid=244) INFO 04-22 00:11:12 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. 
-(EngineCore pid=244) DEBUG 04-22 00:11:12 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=244) DEBUG 04-22 00:11:12 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=244) DEBUG 04-22 00:11:12 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=244) DEBUG 04-22 00:11:12 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=244) DEBUG 04-22 00:11:12 [model_executor/model_loader/weight_utils.py:557] Using model weights format ['*.safetensors'] -(EngineCore pid=244) INFO 04-22 00:11:12 [model_executor/model_loader/weight_utils.py:625] No model.safetensors.index.json found in remote. -(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/1 [00:00 -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/mixed_precision/MPLinearKernel.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/mixed_precision/machete.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/gptq_marlin.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(EngineCore pid=244) INFO 04-22 00:11:26 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/1ed2f413cc/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=9913286625 comp=e546579c48 code=c108c187e9321c2995692e3c6585a7f067e86dd4cdaa92de1c48e27c81f442dc dir=/data/.cache/vllm/torch_compile_cache/1ed2f413cc/rank_0_0/backbone -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=244) DEBUG 
04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 
'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 
'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=244) DEBUG 04-22 
00:11:26 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=244) DEBUG 04-22 
00:11:26 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 
[compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=244) DEBUG 04-22 00:11:26 
[compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore 
pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 
'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/backends.py:1074] Vllm config hash: 9913286625 -(EngineCore pid=244) INFO 04-22 00:11:26 [compilation/backends.py:1111] Dynamo bytecode transform time: 5.44 s -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=244) DEBUG 04-22 00:11:26 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=244) DEBUG 04-22 00:11:27 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:11:27 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=244) DEBUG 04-22 00:11:27 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=244) DEBUG 04-22 00:11:27 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:11:27 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=244) INFO 04-22 00:11:29 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=244) DEBUG 04-22 00:11:29 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/1ed2f413cc/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=244) DEBUG 04-22 00:11:30 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:11:30 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=244) DEBUG 04-22 00:11:30 
[compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.4 ms -(EngineCore pid=244) DEBUG 04-22 00:11:30 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:11:30 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(APIServer pid=1) DEBUG 04-22 00:11:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=244) DEBUG 04-22 00:11:31 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/1ed2f413cc/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=244) DEBUG 04-22 00:11:32 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:11:32 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.0 ms -(EngineCore pid=244) DEBUG 04-22 00:11:32 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms -(EngineCore pid=244) DEBUG 04-22 00:11:32 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:11:32 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(EngineCore pid=244) DEBUG 04-22 00:11:32 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/1ed2f413cc/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') -(EngineCore pid=244) INFO 04-22 00:11:32 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.01 s -(EngineCore pid=244) DEBUG 04-22 00:11:33 [compilation/backends.py:1228] Computation graph saved 
to /data/.cache/vllm/torch_compile_cache/1ed2f413cc/rank_0_0/backbone/computation_graph.py -(EngineCore pid=244) INFO 04-22 00:11:34 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/b11bf988391eee29fa0da7d22d77941137b65c24097fd54712a4123fb7071154/rank_0_0/model -(EngineCore pid=244) INFO 04-22 00:11:34 [compilation/monitor.py:48] torch.compile took 13.36 s in total -(EngineCore pid=244) INFO 04-22 00:11:34 [compilation/monitor.py:76] Initial profiling/warmup run took 0.40 s -(APIServer pid=1) DEBUG 04-22 00:11:40 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=244) INFO 04-22 00:11:40 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=244) INFO 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:11:40 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:11:40 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, 
has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 126.00 MiB first-capture + (51-1) × 8.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:11:40 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:11:40 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:11:40 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-22 00:11:41 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=244) INFO 04-22 00:11:41 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total -(EngineCore pid=244) DEBUG 04-22 00:11:41 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=244) DEBUG 04-22 00:11:41 [v1/worker/gpu_worker.py:430] Free memory after profiling: 72.84 GiB (total), 69.4 GiB (within requested) -(EngineCore pid=244) DEBUG 04-22 00:11:41 [v1/worker/gpu_worker.py:435] Memory profiling takes 20.61 seconds. 
Total non KV cache memory: 7.51GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 5.38GiB. -(EngineCore pid=244) INFO 04-22 00:11:41 [v1/worker/gpu_worker.py:436] Available KV cache memory: 67.71 GiB -(EngineCore pid=244) INFO 04-22 00:11:41 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. -(EngineCore pid=244) INFO 04-22 00:11:41 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 554,704 tokens -(EngineCore pid=244) INFO 04-22 00:11:41 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 67.71x -(EngineCore pid=244) 2026-04-22 00:11:41,843 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=244) DEBUG 04-22 00:11:41 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) 2026-04-22 00:11:41,854 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:12:17 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:12:17 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:12:17 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:12:17 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:12:17 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:12:17 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 -(APIServer pid=1) INFO 04-22 00:12:17 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:12:17 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:12:17 [entrypoints/utils.py:233] non-default args: {'model_tag': 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8', 'model': 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:12:17 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:12:18 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.mistral3.Mistral3ForConditionalGeneration from cache -(APIServer pid=1) DEBUG 04-22 00:12:18 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0016757 secs -(APIServer pid=1) INFO 04-22 00:12:18 [config/model.py:549] Resolved architecture: Mistral3ForConditionalGeneration -(APIServer pid=1) INFO 04-22 00:12:18 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:12:18 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:12:18 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:12:18 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:12:18 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 00:12:18 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 00:12:18 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:12:18 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:12:18 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:12:18 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. -(APIServer pid=1) DEBUG 04-22 00:12:19 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:12:19 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. -(APIServer pid=1) DEBUG 04-22 00:12:19 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -(APIServer pid=1) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. 
-(APIServer pid=1) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. -DEBUG 04-22 00:12:27 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:12:27 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:12:27 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:12:27 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:12:27 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:12:27 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:12:27 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:12:27 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:12:27 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:12:27 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:12:27 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:12:27 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:12:32 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=468) DEBUG 04-22 00:12:34 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:12:34 [v1/engine/utils.py:1158] HELLO from local core engine process 0. 
-(EngineCore pid=468) DEBUG 04-22 00:12:34 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/ecc5eda6-a988-401c-90b3-adab9ce9c938'], outputs=['ipc:///tmp/89b6769d-3511-4956-9b04-41e6ab5c2ec1'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=468) DEBUG 04-22 00:12:34 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=468) DEBUG 04-22 00:12:34 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=468) DEBUG 04-22 00:12:34 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=468) DEBUG 04-22 00:12:34 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=468) DEBUG 04-22 00:12:34 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=468) INFO 04-22 00:12:34 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8', speculative_config=None, tokenizer='RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 
'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=468) DEBUG 04-22 00:12:34 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(EngineCore pid=468) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. 
-(EngineCore pid=468) DEBUG 04-22 00:12:35 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.189:52183 backend=nccl -(EngineCore pid=468) INFO 04-22 00:12:35 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.189:52183 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=468) DEBUG 04-22 00:12:35 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=468) INFO 04-22 00:12:35 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=468) DEBUG 04-22 00:12:36 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776816756.0873954, auto_measure=True -(EngineCore pid=468) DEBUG 04-22 00:12:36 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=468) DEBUG 04-22 00:12:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=468) DEBUG 04-22 00:12:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] 
-(EngineCore pid=468) DEBUG 04-22 00:12:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=468) DEBUG 04-22 00:12:36 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=468) DEBUG 04-22 00:12:36 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(EngineCore pid=468) DEBUG 04-22 00:12:36 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=468) DEBUG 04-22 00:12:36 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. -(EngineCore pid=468) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. -(EngineCore pid=468) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. -(EngineCore pid=468) DEBUG 04-22 00:12:40 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=468) INFO 04-22 00:12:40 [v1/worker/gpu_model_runner.py:4735] Starting to load model RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8... 
-(EngineCore pid=468) INFO 04-22 00:12:41 [config/vllm.py:790] Asynchronous scheduling is enabled. -(EngineCore pid=468) DEBUG 04-22 00:12:41 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds', 't_cond'] -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.self_attn.qkv_proj -(EngineCore pid=468) INFO 04-22 00:12:41 [model_executor/.../linear/__init__.py:291] Selected CutlassInt8ScaledMMLinearKernel for CompressedTensorsW8A8Int8 -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=468) INFO 04-22 00:12:41 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(EngineCore pid=468) INFO 04-22 00:12:41 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 
00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.self_attn.qkv_proj -(EngineCore pid=468) 
DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.mlp.gate_up_proj -(EngineCore 
pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.self_attn.qkv_proj 
-(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
language_model.model.layers.22.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for language_model.model.layers.25.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] 
Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.self_attn.o_proj -(EngineCore pid=468) DEBUG 
04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.mlp.down_proj -(EngineCore pid=468) 
DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.self_attn.o_proj 
-(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.mlp.down_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.self_attn.qkv_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.self_attn.o_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.mlp.gate_up_proj -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
language_model.model.layers.39.mlp.down_proj -(EngineCore pid=468) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=468) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=468) DEBUG 04-22 00:12:41 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=468) DEBUG 04-22 00:12:41 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=468) DEBUG 04-22 00:12:41 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=468) DEBUG 04-22 00:12:41 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 131, 'silu_and_mul': 40, 'conv2d': 1, 'gelu_and_mul': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=468) DEBUG 04-22 00:12:41 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=468) DEBUG 04-22 00:12:41 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 131, 'silu_and_mul': 40, 'conv2d': 1, 'gelu_and_mul': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(EngineCore pid=468) DEBUG 04-22 00:12:41 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00003-of-00006.safetensors', 'model-00005-of-00006.safetensors', 'model-00002-of-00006.safetensors', 'model-00006-of-00006.safetensors', 'model-00004-of-00006.safetensors', 'model-00001-of-00006.safetensors']] -(EngineCore pid=468) Loading safetensors checkpoint shards: 0% Completed | 0/6 [00:00 -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/mistral.py -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=468) INFO 04-22 00:13:11 [compilation/backends.py:1051] Using cache directory: 
/data/.cache/vllm/torch_compile_cache/327f08bb6e/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=a079653aeb comp=e546579c48 code=c676488f783f7fceec1e2b8e4d429af70dfba617895ac42a7314b1ea025a61d6 dir=/data/.cache/vllm/torch_compile_cache/327f08bb6e/rank_0_0/backbone -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 
'auto', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, 
-(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=468) DEBUG 04-22 00:13:11 
[compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=468) DEBUG 04-22 00:13:11 
[compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=468) DEBUG 04-22 00:13:11 
[compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
-(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} -(EngineCore pid=468) DEBUG 04-22 00:13:11 [compilation/backends.py:1074] Vllm config hash: a079653aeb -(EngineCore pid=468) INFO 04-22 00:13:11 [compilation/backends.py:1111] Dynamo bytecode transform time: 6.84 s -(EngineCore pid=468) DEBUG 04-22 00:13:12 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=468) DEBUG 04-22 00:13:12 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=468) DEBUG 04-22 00:13:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=468) DEBUG 04-22 00:13:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=468) DEBUG 04-22 00:13:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=468) DEBUG 04-22 00:13:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=468) DEBUG 04-22 00:13:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(APIServer pid=1) DEBUG 04-22 00:13:14 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=468) INFO 04-22 00:13:15 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=468) DEBUG 04-22 00:13:15 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/327f08bb6e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=468) DEBUG 04-22 00:13:15 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=468) DEBUG 04-22 00:13:15 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=468) DEBUG 04-22 00:13:15 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.5 ms -(EngineCore pid=468) DEBUG 04-22 00:13:15 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=468) DEBUG 04-22 00:13:15 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=468) DEBUG 04-22 00:13:17 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/327f08bb6e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=468) DEBUG 04-22 00:13:17 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/327f08bb6e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_2') -(EngineCore pid=468) DEBUG 04-22 00:13:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=468) DEBUG 04-22 00:13:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms -(EngineCore 
pid=468) DEBUG 04-22 00:13:19 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.2 ms -(EngineCore pid=468) DEBUG 04-22 00:13:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=468) DEBUG 04-22 00:13:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(EngineCore pid=468) DEBUG 04-22 00:13:19 [compilation/backends.py:377] Store the 40-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_40', '/data/.cache/vllm/torch_compile_cache/327f08bb6e/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_40') -(EngineCore pid=468) INFO 04-22 00:13:19 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 7.45 s -(EngineCore pid=468) DEBUG 04-22 00:13:19 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/327f08bb6e/rank_0_0/backbone/computation_graph.py -(EngineCore pid=468) INFO 04-22 00:13:21 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/0fc0a88d52a7ba91dbb88a644ae64adcbc9dfc7015c5ff89047a59dc90fdd3ce/rank_0_0/model -(EngineCore pid=468) INFO 04-22 00:13:21 [compilation/monitor.py:48] torch.compile took 16.46 s in total -(EngineCore pid=468) INFO 04-22 00:13:22 [compilation/monitor.py:76] Initial profiling/warmup run took 0.77 s -(APIServer pid=1) DEBUG 04-22 00:13:24 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=468) INFO 04-22 00:13:27 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=468) DEBUG 04-22 00:13:27 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=468) INFO 04-22 00:13:27 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=468) DEBUG 04-22 00:13:28 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=468) DEBUG 04-22 00:13:28 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 198.00 MiB first-capture + (51-1) × 8.00 MiB per-graph -(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=468) DEBUG 04-22 00:13:28 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) 
-(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=468) DEBUG 04-22 00:13:28 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 8.00 MiB per-graph -(EngineCore pid=468) DEBUG 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=468) INFO 04-22 00:13:28 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.04 GiB total -(EngineCore pid=468) DEBUG 04-22 00:13:29 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=468) DEBUG 04-22 00:13:29 [v1/worker/gpu_worker.py:430] Free memory after profiling: 54.11 GiB (total), 50.66 GiB (within requested) -(EngineCore pid=468) DEBUG 04-22 00:13:29 [v1/worker/gpu_worker.py:435] Memory profiling takes 24.83 seconds. Total non KV cache memory: 26.5GiB; torch peak memory increase: 2.18GiB; non-torch forward increase memory: 0.26GiB; weights memory: 24.07GiB. -(EngineCore pid=468) INFO 04-22 00:13:29 [v1/worker/gpu_worker.py:436] Available KV cache memory: 48.73 GiB -(EngineCore pid=468) INFO 04-22 00:13:29 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9631 to maintain the same effective KV cache size. 
-(EngineCore pid=468) INFO 04-22 00:13:29 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 319,344 tokens -(EngineCore pid=468) INFO 04-22 00:13:29 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 38.98x -(EngineCore pid=468) 2026-04-22 00:13:29,131 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=468) DEBUG 04-22 00:13:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=468) 2026-04-22 00:13:29,143 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=468) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:14:02 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:14:02 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:14:02 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:14:02 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:14:02 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:14:02 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 -(APIServer pid=1) INFO 04-22 00:14:02 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:14:02 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:14:02 [entrypoints/utils.py:233] non-default args: {'model_tag': 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8', 'model': 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8', 'max_model_len': 8192, 'tensor_parallel_size': 2, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:14:02 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:14:03 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.mistral3.Mistral3ForConditionalGeneration from cache -(APIServer pid=1) DEBUG 04-22 00:14:03 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0014510 secs -(APIServer pid=1) INFO 04-22 00:14:03 [config/model.py:549] Resolved architecture: Mistral3ForConditionalGeneration -(APIServer pid=1) INFO 04-22 00:14:03 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:14:03 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:14:03 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:14:03 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:14:03 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 00:14:03 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:14:03 [config/parallel.py:743] Defaulting to use mp for distributed inference -(APIServer pid=1) INFO 04-22 00:14:03 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:14:03 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(APIServer pid=1) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(APIServer pid=1) INFO 04-22 00:14:04 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(APIServer pid=1) DEBUG 04-22 00:14:04 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:14:04 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. -(APIServer pid=1) DEBUG 04-22 00:14:05 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:14:05 [utils/torch_utils.py:119] OMP_NUM_THREADS is not set; defaulting Torch threads to 1. 
-(APIServer pid=1) DEBUG 04-22 00:14:05 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -(APIServer pid=1) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. -(APIServer pid=1) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. -DEBUG 04-22 00:14:14 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:14:14 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:14:14 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:14:14 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:14:14 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:14:14 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:14:14 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:14:14 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:14:14 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:14:14 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:14:14 [platforms/__init__.py:85] Confirmed CUDA platform is available. 
-DEBUG 04-22 00:14:14 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:14:19 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(APIServer pid=1) DEBUG 04-22 00:14:20 [v1/engine/utils.py:1042] Waiting for 1 local, 0 remote core engine proc(s) to connect. -(EngineCore pid=469) DEBUG 04-22 00:14:20 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:14:20 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=469) DEBUG 04-22 00:14:20 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/d25f1ed3-6fc0-4d15-8be0-03802292156b'], outputs=['ipc:///tmp/87c7c8cd-7976-4324-bead-27df712c1084'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=469) DEBUG 04-22 00:14:20 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=469) DEBUG 04-22 00:14:20 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=469) DEBUG 04-22 00:14:20 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=469) DEBUG 04-22 00:14:20 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=469) DEBUG 04-22 00:14:20 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=469) INFO 04-22 00:14:20 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8', speculative_config=None, tokenizer='RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 
'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [6553, 8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': True}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=469) WARNING 04-22 00:14:20 [v1/executor/multiproc_executor.py:1014] Reducing Torch parallelism from 112 threads to 1 to avoid unnecessary CPU contention. Set OMP_NUM_THREADS in the external environment to tune this value as needed. 
-(EngineCore pid=469) INFO 04-22 00:14:20 [v1/executor/multiproc_executor.py:134] DP group leader: node_rank=0, node_rank_within_dp=0, master_addr=127.0.0.1, mq_connect_ip=10.129.9.25 (local), world_size=2, local_world_size=2 -(EngineCore pid=469) DEBUG 04-22 00:14:20 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/6c93d54c-bb41-42ff-b22e-c8895b3ada8f -(EngineCore pid=469) DEBUG 04-22 00:14:20 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[0, 1], buffer_handle=(2, 16777216, 10, 'psm_873c0090'), local_subscribe_addr='ipc:///tmp/6c93d54c-bb41-42ff-b22e-c8895b3ada8f', local_notify_addr='ipc:///tmp/089c5772-3312-4148-b8a6-d9a896842e06', remote_subscribe_addr=None, remote_addr_ipv6=False) -DEBUG 04-22 00:14:24 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:14:24 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:14:24 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:14:24 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:14:24 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:14:24 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:14:24 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:14:24 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:14:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:14:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:14:24 [platforms/__init__.py:113] Checking if ROCm platform is available. 
-DEBUG 04-22 00:14:24 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:14:24 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:14:24 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:14:24 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:14:24 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:14:24 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:14:24 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:14:24 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:14:24 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:14:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:14:24 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:14:24 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:14:24 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:14:29 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:14:29 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. 
-DEBUG 04-22 00:14:30 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:14:30 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:14:30 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:14:30 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -DEBUG 04-22 00:14:30 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:14:30 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:14:30 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:14:30 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. -(APIServer pid=1) DEBUG 04-22 00:14:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -DEBUG 04-22 00:14:31 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -DEBUG 04-22 00:14:31 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. 
-The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. -(Worker pid=668) DEBUG 04-22 00:14:32 [distributed/parallel_state.py:1356] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:56531 backend=nccl -(Worker pid=668) INFO 04-22 00:14:32 [distributed/parallel_state.py:1400] world_size=2 rank=0 local_rank=0 distributed_init_method=tcp://127.0.0.1:56531 backend=nccl -(Worker pid=669) DEBUG 04-22 00:14:32 [distributed/parallel_state.py:1356] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:56531 backend=nccl -(Worker pid=669) INFO 04-22 00:14:32 [distributed/parallel_state.py:1400] world_size=2 rank=1 local_rank=1 distributed_init_method=tcp://127.0.0.1:56531 backend=nccl -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=669) DEBUG 04-22 00:14:32 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -(Worker pid=668) DEBUG 04-22 00:14:32 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 1 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -[Gloo] Rank 0 is connected to 1 peer ranks. Expected number of connected peer ranks is : 1 -(Worker pid=668) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
-(Worker pid=668) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=668) DEBUG 04-22 00:14:33 [utils/nccl.py:34] Found nccl from library libnccl.so.2 -(Worker pid=668) INFO 04-22 00:14:33 [distributed/device_communicators/pynccl.py:111] vLLM is using nccl==2.27.5 -(Worker pid=669) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(Worker pid=669) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(Worker pid=669) DEBUG 04-22 00:14:34 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=668) DEBUG 04-22 00:14:34 [distributed/device_communicators/custom_all_reduce.py:37] Skipping P2P check and trusting the driver's P2P report. -(Worker pid=668) DEBUG 04-22 00:14:34 [distributed/device_communicators/shm_broadcast.py:390] Binding to ipc:///tmp/c98dad95-8292-4c41-9cae-565225952c2b -(Worker pid=668) DEBUG 04-22 00:14:34 [distributed/device_communicators/shm_broadcast.py:443] vLLM message queue communication handle: Handle(local_reader_ranks=[1], buffer_handle=(1, 4194304, 6, 'psm_df50e73a'), local_subscribe_addr='ipc:///tmp/c98dad95-8292-4c41-9cae-565225952c2b', local_notify_addr='ipc:///tmp/c21e2440-2fd2-47d3-a452-a26a1e6d2c8c', remote_subscribe_addr=None, remote_addr_ipv6=False) -(Worker pid=669) DEBUG 04-22 00:14:34 [distributed/device_communicators/shm_broadcast.py:467] Connecting to ipc:///tmp/c98dad95-8292-4c41-9cae-565225952c2b -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(Worker pid=668) INFO 04-22 00:14:34 [distributed/parallel_state.py:1716] rank 0 in world size 2 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(Worker pid=669) DEBUG 04-22 00:14:34 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776816874.35419, auto_measure=True -(Worker pid=669) DEBUG 04-22 00:14:34 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=668) DEBUG 04-22 00:14:34 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.01GiB, free_memory=77.66GiB, total_memory=79.19GiB, cuda_memory=1.53GiB, torch_memory=0.02GiB, non_torch_memory=1.51GiB, timestamp=1776816874.3785136, auto_measure=True -(Worker pid=668) DEBUG 04-22 00:14:34 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(Worker pid=669) DEBUG 04-22 00:14:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=669) DEBUG 04-22 00:14:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=669) DEBUG 04-22 00:14:34 [compilation/decorators.py:213] 
Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=668) DEBUG 04-22 00:14:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=668) DEBUG 04-22 00:14:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(Worker pid=668) DEBUG 04-22 00:14:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(Worker pid=669) DEBUG 04-22 00:14:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=669) DEBUG 04-22 00:14:34 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=668) DEBUG 04-22 00:14:34 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(Worker pid=668) DEBUG 04-22 00:14:34 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. -(Worker pid=668) DEBUG 04-22 00:14:34 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(Worker pid=669) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. 
-(Worker pid=668) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. -(Worker pid=669) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. -(Worker pid=668) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. -(Worker pid=668) DEBUG 04-22 00:14:39 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(Worker_TP0 pid=668) INFO 04-22 00:14:39 [v1/worker/gpu_model_runner.py:4735] Starting to load model RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8... 
-(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds', 't_cond'] -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
language_model.model.layers.2.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
language_model.model.layers.4.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
language_model.model.layers.7.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
language_model.model.layers.9.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 
for language_model.model.layers.12.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for language_model.model.layers.14.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.self_attn.o_proj -(Worker_TP0 pid=668) INFO 04-22 00:14:39 [config/vllm.py:790] Asynchronous scheduling is enabled. 
-(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.self_attn.o_proj -(Worker_TP0 pid=668) INFO 04-22 00:14:39 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds', 't_cond'] -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.self_attn.qkv_proj -(Worker_TP0 pid=668) INFO 04-22 00:14:39 [model_executor/.../linear/__init__.py:291] Selected CutlassInt8ScaledMMLinearKernel for CompressedTensorsW8A8Int8 -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 
00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(Worker_TP0 pid=668) INFO 04-22 00:14:39 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(Worker_TP0 pid=668) INFO 04-22 00:14:39 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.0.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.1.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.2.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.3.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 
00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.4.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 
04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.5.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.6.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.self_attn.qkv_proj -(Worker_TP1 pid=669) 
DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.7.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.self_attn.qkv_proj -(Worker_TP1 
pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.8.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.self_attn.o_proj 
-(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.9.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
language_model.model.layers.10.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.10.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.11.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for language_model.model.layers.37.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.12.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] 
Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.13.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.self_attn.qkv_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.14.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.15.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 
00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.16.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.17.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.18.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 
04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.19.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.20.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.self_attn.o_proj -(Worker_TP0 
pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.21.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.22.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.self_attn.o_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.mlp.gate_up_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [compilation/backends.py:101] Using 
InductorStandaloneAdaptor -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.23.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.24.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.25.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 
[config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 131, 'silu_and_mul': 40, 'conv2d': 1, 'gelu_and_mul': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.26.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.27.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for language_model.model.layers.28.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.28.mlp.down_proj -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 131, 'silu_and_mul': 40, 'conv2d': 1, 'gelu_and_mul': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.29.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.30.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
language_model.model.layers.31.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.31.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.32.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.33.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: 
CompressedTensorsW8A8Int8 for language_model.model.layers.33.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.34.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.35.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.36.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.37.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.38.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.self_attn.qkv_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.self_attn.o_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.mlp.gate_up_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for language_model.model.layers.39.mlp.down_proj -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 131, 'silu_and_mul': 40, 'conv2d': 1, 'gelu_and_mul': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [config/compilation.py:1194] enabled custom ops: Counter() -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 131, 'silu_and_mul': 40, 'conv2d': 1, 'gelu_and_mul': 1, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(Worker_TP0 pid=668) DEBUG 04-22 
00:14:39 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(Worker_TP1 pid=669) DEBUG 04-22 00:14:39 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00006.safetensors', 'model-00001-of-00006.safetensors', 'model-00003-of-00006.safetensors', 'model-00005-of-00006.safetensors', 'model-00006-of-00006.safetensors', 'model-00004-of-00006.safetensors']] -(Worker_TP0 pid=668) DEBUG 04-22 00:14:39 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00004-of-00006.safetensors', 'model-00003-of-00006.safetensors', 'model-00001-of-00006.safetensors', 'model-00002-of-00006.safetensors', 'model-00006-of-00006.safetensors', 'model-00005-of-00006.safetensors']] -(Worker_TP0 pid=668) Loading safetensors checkpoint shards: 0% Completed | 0/6 [00:00 -(Worker_TP1 pid=669) DEBUG 04-22 00:14:59 [compilation/decorators.py:528] Start compiling function -(EngineCore pid=469) DEBUG 04-22 00:15:00 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) DEBUG 04-22 00:15:00 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP1 pid=669) DEBUG 04-22 
00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/mistral.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=041fbb8292 comp=e546579c48 code=fdff84f9a2f10e49a795b178e6bd05497bf710b53e4ed33c50d0ecc2ca179fc3 dir=/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_1_0/backbone -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 
[compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 
'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 
'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': 
False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP1 
pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP1 pid=669) 
DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP1 pid=669) DEBUG 04-22 
00:15:06 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP1 pid=669) 
DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 
'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 
[compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, 
-(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] Vllm config hash: 041fbb8292 -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(Worker_TP0 pid=668) 
DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] 
/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/mistral.py -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(Worker_TP0 pid=668) INFO 04-22 00:15:06 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone for vLLM's torch.compile -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=041fbb8292 comp=e546579c48 code=fdff84f9a2f10e49a795b178e6bd05497bf710b53e4ed33c50d0ecc2ca179fc3 dir=/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] Compile env factors (raw): -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, 
-(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'CUDA_HOME': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VERBOSE': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, 
-(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(Worker_TP0 pid=668) DEBUG 
04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 
[compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 
[compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 
[compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 
[compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(Worker_TP0 pid=668) DEBUG 04-22 
00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 
512, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(Worker_TP0 
pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 
'VLLM_USE_STANDALONE_COMPILE': True, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/backends.py:1074] Vllm config hash: 041fbb8292 -(Worker_TP0 pid=668) INFO 04-22 00:15:06 [compilation/backends.py:1111] Dynamo bytecode transform time: 6.95 s -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [compilation/.../fusion/allreduce_rms_fusion.py:765] Flashinfer max size: 64 MB,Maximal number of tokens used by Flashinfer Allreduce Fusion: 6553 -(Worker_TP0 pid=668) INFO 04-22 00:15:06 [distributed/device_communicators/flashinfer_all_reduce.py:109] Auto-selected flashinfer allreduce backend: trtllm -(Worker_TP0 pid=668) /usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py:83: UserWarning: barrier(): using the device under current context. 
You can specify `device_id` in `init_process_group` to mute this warning. -(Worker_TP0 pid=668) return func(*args, **kwargs) -(Worker_TP0 pid=668) DEBUG 04-22 00:15:06 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=0, max_token_num=6553, hidden_dim=5120, dtype=torch.bfloat16 -(Worker_TP1 pid=669) DEBUG 04-22 00:15:06 [distributed/device_communicators/flashinfer_all_reduce.py:79] Initialized FlashInfer All Reduce workspace: backend=trtllm, world_size=2, rank=1, max_token_num=6553, hidden_dim=5120, dtype=torch.bfloat16 -(Worker_TP0 pid=668) INFO 04-22 00:15:06 [distributed/device_communicators/flashinfer_all_reduce.py:149] Initialized FlashInfer Allreduce norm fusion workspace with backend=trtllm -(Worker_TP0 pid=668) DEBUG 04-22 00:15:07 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 6553), (6554, 8192)] -(Worker_TP0 pid=668) DEBUG 04-22 00:15:07 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(Worker_TP0 pid=668) DEBUG 04-22 00:15:08 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=668) DEBUG 04-22 00:15:08 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:08 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 0 patterns -(Worker_TP0 pid=668) DEBUG 04-22 00:15:08 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 1.0 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:08 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:08 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=668) DEBUG 04-22 00:15:08 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP1 pid=669) DEBUG 04-22 
00:15:08 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=669) DEBUG 04-22 00:15:08 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP1 pid=669) DEBUG 04-22 00:15:08 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 0 patterns -(Worker_TP1 pid=669) DEBUG 04-22 00:15:08 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 1.1 ms -(Worker_TP1 pid=669) DEBUG 04-22 00:15:08 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=669) DEBUG 04-22 00:15:08 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=669) DEBUG 04-22 00:15:08 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=668) INFO 04-22 00:15:10 [compilation/backends.py:372] Cache the graph of compile range (1, 6553) for later use -(Worker_TP0 pid=668) DEBUG 04-22 00:15:10 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 6553) from inductor_standalone via handle ('artifact_compile_range_1_6553_subgraph_0', '/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/artifact_compile_range_1_6553_subgraph_0') -(APIServer pid=1) DEBUG 04-22 00:15:10 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP1 pid=669) DEBUG 04-22 00:15:10 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=669) DEBUG 04-22 00:15:10 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=669) DEBUG 04-22 00:15:10 [compilation/passes/pass_manager.py:100] Skipping with compile range (6554, 8192) -(Worker_TP1 pid=669) DEBUG 04-22 00:15:10 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=669) DEBUG 04-22 00:15:10 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=669) DEBUG 04-22 00:15:10 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:10 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=668) DEBUG 04-22 00:15:10 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:10 [compilation/passes/pass_manager.py:100] Skipping with compile range (6554, 8192) -(Worker_TP0 pid=668) DEBUG 04-22 00:15:10 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:10 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=668) DEBUG 04-22 00:15:10 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=668) INFO 04-22 00:15:11 [compilation/backends.py:372] Cache the graph of compile range (6554, 8192) for later use -(Worker_TP0 pid=668) DEBUG 04-22 00:15:11 [compilation/backends.py:377] Store the 0-th graph for compile range(6554, 8192) from inductor_standalone via handle ('artifact_compile_range_6554_8192_subgraph_0', 
'/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/artifact_compile_range_6554_8192_subgraph_0') -(Worker_TP1 pid=669) DEBUG 04-22 00:15:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=669) DEBUG 04-22 00:15:11 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.0 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:11 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=668) DEBUG 04-22 00:15:11 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.7 ms -(Worker_TP1 pid=669) DEBUG 04-22 00:15:11 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=669) DEBUG 04-22 00:15:11 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.2 ms -(Worker_TP1 pid=669) DEBUG 04-22 00:15:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP1 pid=669) DEBUG 04-22 00:15:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP1 pid=669) DEBUG 04-22 00:15:11 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:11 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=668) DEBUG 04-22 00:15:11 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 41.6 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:11 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:11 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 6 nodes -(Worker_TP0 pid=668) DEBUG 04-22 00:15:11 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.6 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:12 [compilation/backends.py:377] 
Store the 1-th graph for compile range(1, 6553) from inductor_standalone via handle ('artifact_compile_range_1_6553_subgraph_1', '/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/artifact_compile_range_1_6553_subgraph_1') -(Worker_TP1 pid=669) DEBUG 04-22 00:15:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=669) DEBUG 04-22 00:15:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.8 ms -(Worker_TP1 pid=669) DEBUG 04-22 00:15:13 [compilation/passes/pass_manager.py:100] Skipping with compile range (6554, 8192) -(Worker_TP1 pid=669) DEBUG 04-22 00:15:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.5 ms -(Worker_TP1 pid=669) DEBUG 04-22 00:15:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=669) DEBUG 04-22 00:15:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:13 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=668) DEBUG 04-22 00:15:13 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:13 [compilation/passes/pass_manager.py:100] Skipping with compile range (6554, 8192) -(Worker_TP0 pid=668) DEBUG 04-22 00:15:13 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.4 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:13 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=668) DEBUG 04-22 00:15:13 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:14 [compilation/backends.py:377] Store the 1-th graph for compile range(6554, 8192) from inductor_standalone via handle 
('artifact_compile_range_6554_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/artifact_compile_range_6554_8192_subgraph_1') -(Worker_TP0 pid=668) DEBUG 04-22 00:15:15 [compilation/backends.py:377] Store the 2-th graph for compile range(1, 6553) from inductor_standalone via handle ('artifact_compile_range_1_6553_subgraph_2', '/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/artifact_compile_range_1_6553_subgraph_2') -(Worker_TP0 pid=668) DEBUG 04-22 00:15:15 [compilation/backends.py:377] Store the 2-th graph for compile range(6554, 8192) from inductor_standalone via handle ('artifact_compile_range_6554_8192_subgraph_2', '/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/artifact_compile_range_6554_8192_subgraph_2') -(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 40.1 ms -(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.6 ms -(Worker_TP0 pid=668) DEBUG 04-22 
00:15:19 [compilation/.../fusion/allreduce_rms_fusion.py:865] Replaced 2 patterns -(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] AllReduceFusionPass completed in 40.2 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.2 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 2 nodes, removed 5 nodes -(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.4 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/backends.py:377] Store the 40-th graph for compile range(1, 6553) from inductor_standalone via handle ('artifact_compile_range_1_6553_subgraph_40', '/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/artifact_compile_range_1_6553_subgraph_40') -(Worker_TP0 pid=668) INFO 04-22 00:15:19 [compilation/backends.py:390] Compiling a graph for compile range (1, 6553) takes 9.11 s -(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/passes/pass_manager.py:100] Skipping with compile range (6554, 8192) -(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP1 pid=669) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(Worker_TP0 pid=668) DEBUG 
04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/passes/pass_manager.py:100] Skipping with compile range (6554, 8192) -(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.3 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(Worker_TP0 pid=668) DEBUG 04-22 00:15:19 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.3 ms -(Worker_TP0 pid=668) DEBUG 04-22 00:15:20 [compilation/backends.py:377] Store the 40-th graph for compile range(6554, 8192) from inductor_standalone via handle ('artifact_compile_range_6554_8192_subgraph_40', '/data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/artifact_compile_range_6554_8192_subgraph_40') -(Worker_TP0 pid=668) INFO 04-22 00:15:20 [compilation/backends.py:390] Compiling a graph for compile range (6554, 8192) takes 10.21 s -(APIServer pid=1) DEBUG 04-22 00:15:20 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP0 pid=668) DEBUG 04-22 00:15:20 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/14c0de09a2/rank_0_0/backbone/computation_graph.py -(Worker_TP0 pid=668) INFO 04-22 00:15:22 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/85f3ac149fb92a20bbf4acfb1db1d6d2441b76e5d6bb81e4e5b7eb41ef3d9031/rank_0_0/model -(Worker_TP0 pid=668) INFO 04-22 00:15:22 [compilation/monitor.py:48] torch.compile took 22.93 s in total -(Worker_TP0 pid=668) INFO 04-22 00:15:23 [compilation/monitor.py:76] Initial profiling/warmup run took 0.63 s -(Worker_TP0 pid=668) INFO 04-22 00:15:28 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP0 pid=668) DEBUG 04-22 00:15:28 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP0 pid=668) INFO 04-22 00:15:28 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP1 pid=669) INFO 04-22 00:15:28 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(Worker_TP1 pid=669) DEBUG 04-22 00:15:28 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(Worker_TP1 pid=669) INFO 04-22 00:15:28 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, 
ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 100.00 MiB first-capture + (51-1) × 24.00 MiB per-graph -(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 100.00 MiB first-capture + (51-1) × 24.00 MiB per-graph -(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 
pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(Worker_TP0 pid=668) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 136.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP1 pid=669) DEBUG 04-22 00:15:29 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 136.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(Worker_TP0 pid=668) INFO 04-22 00:15:29 
[distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP1 pid=669) INFO 04-22 00:15:29 [distributed/device_communicators/custom_all_reduce.py:216] Registering 0 cuda graph addresses -(Worker_TP0 pid=668) DEBUG 04-22 00:15:30 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP0 pid=668) INFO 04-22 00:15:30 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.60 GiB total -(Worker_TP1 pid=669) DEBUG 04-22 00:15:30 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(Worker_TP1 pid=669) INFO 04-22 00:15:30 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 1.60 GiB total -(Worker_TP0 pid=668) DEBUG 04-22 00:15:30 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP0 pid=668) DEBUG 04-22 00:15:30 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.25 GiB (total), 59.82 GiB (within requested) -(Worker_TP0 pid=668) DEBUG 04-22 00:15:30 [v1/worker/gpu_worker.py:435] Memory profiling takes 31.49 seconds. Total non KV cache memory: 16.21GiB; torch peak memory increase: 2.03GiB; non-torch forward increase memory: 2.07GiB; weights memory: 12.11GiB. -(Worker_TP0 pid=668) INFO 04-22 00:15:30 [v1/worker/gpu_worker.py:436] Available KV cache memory: 59.02 GiB -(Worker_TP0 pid=668) INFO 04-22 00:15:30 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9702 to maintain the same effective KV cache size. 
-(Worker_TP0 pid=668) DEBUG 04-22 00:15:30 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=469) DEBUG 04-22 00:15:30 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=469) DEBUG 04-22 00:15:30 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=669) DEBUG 04-22 00:15:30 [v1/worker/gpu_worker.py:424] Initial free memory: 77.66 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(Worker_TP1 pid=669) DEBUG 04-22 00:15:30 [v1/worker/gpu_worker.py:430] Free memory after profiling: 62.25 GiB (total), 59.82 GiB (within requested) -(Worker_TP1 pid=669) DEBUG 04-22 00:15:30 [v1/worker/gpu_worker.py:435] Memory profiling takes 31.45 seconds. Total non KV cache memory: 16.21GiB; torch peak memory increase: 2.03GiB; non-torch forward increase memory: 2.07GiB; weights memory: 12.11GiB. -(Worker_TP1 pid=669) INFO 04-22 00:15:30 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9702 to maintain the same effective KV cache size. 
-(Worker_TP1 pid=669) DEBUG 04-22 00:15:30 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=469) DEBUG 04-22 00:15:30 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=469) INFO 04-22 00:15:30 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 773,584 tokens -(EngineCore pid=469) INFO 04-22 00:15:30 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 94.43x -(Worker_TP0 pid=668) DEBUG 04-22 00:15:30 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=669) DEBUG 04-22 00:15:30 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=668) INFO 04-22 00:15:30 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 -(Worker_TP1 pid=669) INFO 04-22 00:15:30 [v1/worker/gpu_worker.py:578] Compile and warming up model for size 8192 -(Worker_TP0 pid=668) DEBUG 04-22 00:15:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-22 00:15:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=668) 2026-04-22 00:15:30,826 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP1 pid=669) 2026-04-22 00:15:30,826 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(Worker_TP0 pid=668) DEBUG 04-22 00:15:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP1 pid=669) DEBUG 04-22 00:15:30 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(APIServer pid=1) DEBUG 04-22 00:15:30 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(Worker_TP0 pid=668) 2026-04-22 00:15:30,880 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP1 pid=669) 2026-04-22 00:15:30,880 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(Worker_TP1 pid=669) DEBUG 04-22 00:15:31 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(Worker_TP0 pid=668) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00:1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=469) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=469) INFO 04-22 00:15:41 [config/compilation.py:290] Enabled custom fusions: allreduce_rms -(Worker_TP1 pid=669) DEBUG 04-22 00:15:41 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP0 pid=668) DEBUG 04-22 00:15:41 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=469) DEBUG 04-22 00:15:41 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(EngineCore pid=469) DEBUG 04-22 00:15:41 [v1/engine/core.py:1158] EngineCore waiting for work. -(EngineCore pid=469) DEBUG 04-22 00:15:41 [v1/engine/core.py:1158] EngineCore waiting for work. -(APIServer pid=1) INFO 04-22 00:15:41 [entrypoints/openai/api_server.py:590] Supported tasks: ['generate'] -(APIServer pid=1) DEBUG 04-22 00:15:42 [renderers/base.py:197] Warming up chat template processing... 
-(Worker_TP0 pid=668) DEBUG 04-22 00:15:42 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(Worker_TP1 pid=669) DEBUG 04-22 00:15:42 [distributed/device_communicators/shm_broadcast.py:191] Poller received notify event -(APIServer pid=1) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. -(APIServer pid=1) INFO 04-22 00:15:43 [renderers/hf.py:314] Detected the chat template content format to be 'openai'. You can set `--chat-template-content-format` to override this. -(APIServer pid=1) DEBUG 04-22 00:15:43 [renderers/base.py:203] Chat template warmup completed in 1.712s -(APIServer pid=1) DEBUG 04-22 00:15:43 [renderers/base.py:218] Warming up multi-modal processing... -(APIServer pid=1) The tokenizer you are loading from 'RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue. 
-(APIServer pid=1) INFO 04-22 00:15:46 [renderers/base.py:231] Multi-modal warmup completed in 2.956s -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/openai/api_server.py:594] Starting vLLM server on http://0.0.0.0:8000 -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:37] Available routes are: -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /openapi.json, Methods: HEAD, GET -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /docs, Methods: HEAD, GET -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /docs/oauth2-redirect, Methods: HEAD, GET -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /redoc, Methods: HEAD, GET -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /tokenize, Methods: POST -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /detokenize, Methods: POST -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /load, Methods: GET -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /version, Methods: GET -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /health, Methods: GET -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /metrics, Methods: GET -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/models, Methods: GET -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /ping, Methods: GET -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /ping, Methods: POST -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /invocations, Methods: POST -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/chat/completions, Methods: POST -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/chat/completions/batch, Methods: POST -(APIServer pid=1) INFO 04-22 00:15:46 
[entrypoints/launcher.py:46] Route: /v1/responses, Methods: POST -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}, Methods: GET -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/responses/{response_id}/cancel, Methods: POST -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/completions, Methods: POST -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/messages, Methods: POST -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/messages/count_tokens, Methods: POST -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /inference/v1/generate, Methods: POST -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /scale_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /is_scaling_elastic_ep, Methods: POST -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/chat/completions/render, Methods: POST -(APIServer pid=1) INFO 04-22 00:15:46 [entrypoints/launcher.py:46] Route: /v1/completions/render, Methods: POST -(APIServer pid=1) INFO: Started server process [1] -(APIServer pid=1) INFO: Waiting for application startup. -(APIServer pid=1) INFO: Application startup complete. -(APIServer pid=1) DEBUG 04-22 00:15:51 [v1/engine/async_llm.py:875] Called check_health. -(APIServer pid=1) INFO: 10.129.8.2:59896 - "GET /health HTTP/1.1" 200 OK diff --git a/accuracy/results/v0.19.0/logs/w8a8-redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16.log b/accuracy/results/v0.19.0/logs/w8a8-redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16.log deleted file mode 100644 index 37751963..00000000 --- a/accuracy/results/v0.19.0/logs/w8a8-redhatai-llam--h100-80gb--tp1pp1dp1--8192-dtf16.log +++ /dev/null @@ -1,878 +0,0 @@ -DEBUG 04-22 00:09:38 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. 
-DEBUG 04-22 00:09:38 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:09:38 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:09:38 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:09:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:09:38 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:09:38 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:09:38 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:09:38 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:09:38 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:09:38 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:09:38 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:09:42 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -DEBUG 04-22 00:09:44 [entrypoints/utils.py:170] Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn' -DEBUG 04-22 00:09:44 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -DEBUG 04-22 00:09:44 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:09:44 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:09:44 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:09:44 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:09:44 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:09:44 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:09:44 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 -(APIServer pid=1) INFO 04-22 00:09:44 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:09:44 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:09:44 [entrypoints/utils.py:233] non-default args: {'model_tag': 'RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8', 'model': 'RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8', 'dtype': 'float16', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:09:44 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:09:45 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.llama.LlamaForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:09:45 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0003835 secs -(APIServer pid=1) INFO 04-22 00:09:45 [config/model.py:549] Resolved architecture: LlamaForCausalLM -(APIServer pid=1) WARNING 04-22 00:09:45 [config/model.py:2016] Casting torch.bfloat16 to torch.float16. -(APIServer pid=1) INFO 04-22 00:09:45 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:09:45 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:09:45 [config/model.py:1801] Generative models support prefix caching. 
-(APIServer pid=1) DEBUG 04-22 00:09:45 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:09:45 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. -(APIServer pid=1) DEBUG 04-22 00:09:45 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 00:09:45 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:09:45 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:09:45 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:09:45 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:09:46 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:09:46 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:09:49 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:09:49 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:09:49 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:09:49 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:09:49 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:09:49 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:09:49 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:09:49 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:09:49 [platforms/__init__.py:165] Checking if CPU platform is available. 
-DEBUG 04-22 00:09:49 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:09:49 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:09:49 [platforms/__init__.py:247] Automatically detected platform cuda. -DEBUG 04-22 00:09:54 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=243) DEBUG 04-22 00:09:56 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:09:56 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=243) DEBUG 04-22 00:09:56 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/1f7316b6-80e0-4336-88b3-183484609596'], outputs=['ipc:///tmp/c432d9a3-9551-4a4e-816f-13a218d6bd75'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=243) DEBUG 04-22 00:09:56 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=243) DEBUG 04-22 00:09:56 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=243) DEBUG 04-22 00:09:56 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=243) DEBUG 04-22 00:09:56 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=243) DEBUG 04-22 00:09:56 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=243) INFO 04-22 00:09:56 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8', speculative_config=None, tokenizer='RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 
'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=243) DEBUG 04-22 00:09:56 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.188:32907 backend=nccl -(EngineCore pid=243) INFO 04-22 00:09:56 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.128.4.188:32907 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) DEBUG 04-22 00:09:56 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=243) INFO 04-22 00:09:56 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=243) DEBUG 04-22 00:09:57 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776816597.0954285, auto_measure=True -(EngineCore pid=243) DEBUG 04-22 00:09:57 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=243) DEBUG 04-22 00:09:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:09:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:09:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=243) DEBUG 04-22 00:09:57 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=243) DEBUG 04-22 00:09:57 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=243) DEBUG 04-22 00:09:57 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=243) INFO 04-22 00:09:57 [v1/worker/gpu_model_runner.py:4735] Starting to load model RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8... -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.self_attn.qkv_proj -(EngineCore pid=243) INFO 04-22 00:09:57 [model_executor/.../linear/__init__.py:291] Selected CutlassInt8ScaledMMLinearKernel for CompressedTensorsW8A8Int8 -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.float16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=243) INFO 04-22 00:09:57 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(EngineCore pid=243) INFO 04-22 00:09:57 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 
00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
model.layers.5.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Int8 for model.layers.8.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
model.layers.13.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] 
Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
model.layers.21.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] 
Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:57 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.28.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
model.layers.29.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.29.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.30.mlp.down_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.self_attn.qkv_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.self_attn.o_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.mlp.gate_up_proj -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.31.mlp.down_proj -(EngineCore pid=243) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. 
-(EngineCore pid=243) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=243) DEBUG 04-22 00:09:58 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=243) DEBUG 04-22 00:09:58 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=243) DEBUG 04-22 00:09:58 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 65, 'silu_and_mul': 32, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... -(EngineCore pid=243) DEBUG 04-22 00:09:58 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00001-of-00002.safetensors', 'model-00002-of-00002.safetensors']] -(EngineCore pid=243) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 -(APIServer pid=1) DEBUG 04-22 00:10:06 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=243) DEBUG 04-22 
00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/llama.py -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=243) INFO 04-22 00:10:11 [compilation/backends.py:1051] Using cache directory: /data/.cache/vllm/torch_compile_cache/89d9e052d6/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=0a3183d67c 
comp=e546579c48 code=00c436b0deda272393bbd56b49f2a57f076817aa66601ed815250cff678fbbf0 dir=/data/.cache/vllm/torch_compile_cache/89d9e052d6/rank_0_0/backbone -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VERBOSE': False, 
-(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 
'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, 
-(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore 
pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=243) 
DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=243) DEBUG 04-22 
00:10:11 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=243) 
DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 
256, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, 
-(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 
'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] 'V_SCALE_CONSTANT': 100} -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/backends.py:1074] Vllm config hash: 0a3183d67c -(EngineCore pid=243) INFO 04-22 00:10:11 [compilation/backends.py:1111] Dynamo bytecode transform time: 5.62 s -(EngineCore pid=243) DEBUG 04-22 00:10:11 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=243) DEBUG 04-22 00:10:11 
[compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=243) DEBUG 04-22 00:10:12 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:10:12 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.4 ms -(EngineCore pid=243) DEBUG 04-22 00:10:12 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.0 ms -(EngineCore pid=243) DEBUG 04-22 00:10:12 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:10:12 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=243) INFO 04-22 00:10:14 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=243) DEBUG 04-22 00:10:14 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/89d9e052d6/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=243) DEBUG 04-22 00:10:15 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:10:15 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=243) DEBUG 04-22 00:10:15 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.6 ms -(EngineCore pid=243) DEBUG 04-22 00:10:15 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:10:15 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(APIServer pid=1) DEBUG 04-22 00:10:16 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=243) DEBUG 04-22 00:10:16 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/89d9e052d6/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=243) DEBUG 04-22 00:10:17 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=243) DEBUG 04-22 00:10:17 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.2 ms -(EngineCore pid=243) DEBUG 04-22 00:10:17 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.3 ms -(EngineCore pid=243) DEBUG 04-22 00:10:17 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=243) DEBUG 04-22 00:10:17 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(EngineCore pid=243) DEBUG 04-22 00:10:18 [compilation/backends.py:377] Store the 32-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_32', '/data/.cache/vllm/torch_compile_cache/89d9e052d6/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_32') -(EngineCore pid=243) INFO 04-22 00:10:18 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.40 s -(EngineCore pid=243) DEBUG 04-22 00:10:18 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/89d9e052d6/rank_0_0/backbone/computation_graph.py -(EngineCore pid=243) INFO 04-22 00:10:19 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/0d0d81f25d4664a542cae9b3d62552b300cbbc433ae2ccda7e244ce4a0a6c9d8/rank_0_0/model -(EngineCore pid=243) INFO 04-22 00:10:19 [compilation/monitor.py:48] torch.compile took 13.43 s in total -(EngineCore pid=243) INFO 04-22 00:10:19 
[compilation/monitor.py:76] Initial profiling/warmup run took 0.33 s -(EngineCore pid=243) INFO 04-22 00:10:25 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=243) INFO 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:10:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:10:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 112.00 MiB first-capture + (51-1) × 8.00 MiB per-graph -(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:10:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, 
num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) DEBUG 04-22 00:10:25 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=243) DEBUG 04-22 00:10:25 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 262.00 MiB first-capture + (51-1) × 4.00 MiB per-graph -(APIServer pid=1) DEBUG 04-22 00:10:26 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=243) DEBUG 04-22 00:10:26 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=243) INFO 04-22 00:10:26 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.84 GiB total -(EngineCore pid=243) DEBUG 04-22 00:10:26 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=243) DEBUG 04-22 00:10:26 [v1/worker/gpu_worker.py:430] Free memory after profiling: 69.73 GiB (total), 66.28 GiB (within requested) -(EngineCore pid=243) DEBUG 04-22 00:10:26 [v1/worker/gpu_worker.py:435] Memory profiling takes 20.90 seconds. Total non KV cache memory: 10.63GiB; torch peak memory increase: 1.89GiB; non-torch forward increase memory: 0.25GiB; weights memory: 8.49GiB. -(EngineCore pid=243) INFO 04-22 00:10:26 [v1/worker/gpu_worker.py:436] Available KV cache memory: 64.6 GiB -(EngineCore pid=243) INFO 04-22 00:10:26 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. 
To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9606 to maintain the same effective KV cache size. -(EngineCore pid=243) INFO 04-22 00:10:26 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 529,200 tokens -(EngineCore pid=243) INFO 04-22 00:10:26 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 64.60x -(EngineCore pid=243) 2026-04-22 00:10:26,778 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... -(EngineCore pid=243) DEBUG 04-22 00:10:26 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=243) 2026-04-22 00:10:26,789 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=243) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -DEBUG 04-22 00:21:00 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -DEBUG 04-22 00:21:00 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(APIServer pid=1) INFO 04-22 00:21:00 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:21:00 [entrypoints/utils.py:299] █ █ █▄ ▄█ -(APIServer pid=1) INFO 04-22 00:21:00 [entrypoints/utils.py:299] ▄▄ ▄█ █ █ █ ▀▄▀ █ version 0.19.0 -(APIServer pid=1) INFO 04-22 00:21:00 [entrypoints/utils.py:299] █▄█▀ █ █ █ █ model RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 -(APIServer pid=1) INFO 04-22 00:21:00 [entrypoints/utils.py:299] ▀▀ ▀▀▀▀▀ ▀▀▀▀▀ ▀ ▀ -(APIServer pid=1) INFO 04-22 00:21:00 [entrypoints/utils.py:299] -(APIServer pid=1) INFO 04-22 00:21:00 [entrypoints/utils.py:233] non-default args: {'model_tag': 'RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8', 'model': 'RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8', 'max_model_len': 8192, 'gpu_memory_utilization': 0.95, 'enable_prefix_caching': False} -(APIServer pid=1) WARNING 04-22 00:21:00 [envs.py:1744] Unknown vLLM environment variable detected: VLLM_ATTENTION_BACKEND -(APIServer pid=1) DEBUG 04-22 00:21:01 [model_executor/models/registry.py:827] Loaded model info for class vllm.model_executor.models.qwen2.Qwen2ForCausalLM from cache -(APIServer pid=1) DEBUG 04-22 00:21:01 [logging_utils/log_time.py:29] Registry inspect model class: Elapsed time 0.0026569 secs -(APIServer pid=1) INFO 04-22 00:21:01 [config/model.py:549] Resolved architecture: Qwen2ForCausalLM -(APIServer pid=1) INFO 04-22 00:21:01 [config/model.py:1678] Using max model len 8192 -(APIServer pid=1) DEBUG 04-22 00:21:01 [config/model.py:1743] Generative models support chunked prefill. -(APIServer pid=1) DEBUG 04-22 00:21:01 [config/model.py:1801] Generative models support prefix caching. -(APIServer pid=1) DEBUG 04-22 00:21:01 [engine/arg_utils.py:2116] Enabling chunked prefill by default -(APIServer pid=1) DEBUG 04-22 00:21:01 [engine/arg_utils.py:2232] Defaulting max_num_batched_tokens to 8192 for OPENAI_API_SERVER usage context. 
-(APIServer pid=1) DEBUG 04-22 00:21:01 [engine/arg_utils.py:2242] Defaulting max_num_seqs to 1024 for OPENAI_API_SERVER usage context. -(APIServer pid=1) INFO 04-22 00:21:01 [config/scheduler.py:238] Chunked prefill is enabled with max_num_batched_tokens=8192. -(APIServer pid=1) INFO 04-22 00:21:01 [config/vllm.py:790] Asynchronous scheduling is enabled. -(APIServer pid=1) DEBUG 04-22 00:21:01 [plugins/__init__.py:36] No plugins for group vllm.stat_logger_plugins found. -(APIServer pid=1) DEBUG 04-22 00:21:01 [tokenizers/registry.py:68] Loading CachedHfTokenizer for tokenizer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:21:02 [renderers/registry.py:57] Loading HfRenderer for renderer_mode='hf' -(APIServer pid=1) DEBUG 04-22 00:21:02 [plugins/io_processors/__init__.py:36] No IOProcessor plugins requested by the model -DEBUG 04-22 00:21:05 [plugins/__init__.py:36] No plugins for group vllm.platform_plugins found. -DEBUG 04-22 00:21:05 [platforms/__init__.py:37] Checking if TPU platform is available. -DEBUG 04-22 00:21:05 [platforms/__init__.py:56] TPU platform is not available because: No module named 'libtpu' -DEBUG 04-22 00:21:05 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:21:05 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:21:05 [platforms/__init__.py:113] Checking if ROCm platform is available. -DEBUG 04-22 00:21:05 [platforms/__init__.py:127] ROCm platform is not available because: No module named 'amdsmi' -DEBUG 04-22 00:21:05 [platforms/__init__.py:134] Checking if XPU platform is available. -DEBUG 04-22 00:21:05 [platforms/__init__.py:165] Checking if CPU platform is available. -DEBUG 04-22 00:21:05 [platforms/__init__.py:62] Checking if CUDA platform is available. -DEBUG 04-22 00:21:05 [platforms/__init__.py:85] Confirmed CUDA platform is available. -DEBUG 04-22 00:21:05 [platforms/__init__.py:247] Automatically detected platform cuda. 
-DEBUG 04-22 00:21:10 [utils/import_utils.py:74] Loading module triton_kernels from /usr/local/lib/python3.12/dist-packages/vllm/third_party/triton_kernels/__init__.py. -(EngineCore pid=244) DEBUG 04-22 00:21:12 [v1/engine/core.py:1018] Waiting for init message from front-end. -(APIServer pid=1) DEBUG 04-22 00:21:12 [v1/engine/utils.py:1158] HELLO from local core engine process 0. -(EngineCore pid=244) DEBUG 04-22 00:21:12 [v1/engine/core.py:1029] Received init message: EngineHandshakeMetadata(addresses=EngineZmqAddresses(inputs=['ipc:///tmp/37ef579d-35a8-4fe4-9b49-bd08c5456acc'], outputs=['ipc:///tmp/69aed467-8552-41bb-97fc-aef0d67a2515'], coordinator_input=None, coordinator_output=None, frontend_stats_publish_address=None), parallel_config={}) -(EngineCore pid=244) DEBUG 04-22 00:21:12 [v1/engine/core.py:826] Has DP Coordinator: False, stats publish address: None -(EngineCore pid=244) DEBUG 04-22 00:21:12 [plugins/__init__.py:44] Available plugins for group vllm.general_plugins: -(EngineCore pid=244) DEBUG 04-22 00:21:12 [plugins/__init__.py:46] - lora_filesystem_resolver -> vllm.plugins.lora_resolvers.filesystem_resolver:register_filesystem_resolver -(EngineCore pid=244) DEBUG 04-22 00:21:12 [plugins/__init__.py:46] - lora_hf_hub_resolver -> vllm.plugins.lora_resolvers.hf_hub_resolver:register_hf_hub_resolver -(EngineCore pid=244) DEBUG 04-22 00:21:12 [plugins/__init__.py:49] All plugins in this group will be loaded. Set `VLLM_PLUGINS` to control which plugins to load. 
-(EngineCore pid=244) INFO 04-22 00:21:12 [v1/engine/core.py:105] Initializing a V1 LLM engine (v0.19.0) with config: model='RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8', speculative_config=None, tokenizer='RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=compressed-tensors, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8, enable_prefix_caching=False, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': , 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 
'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'cudagraph_mm_encoder': False, 'encoder_cudagraph_token_budgets': [], 'encoder_cudagraph_max_images_per_batch': 0, 'compile_sizes': [], 'compile_ranges_endpoints': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'size_asserts': True, 'alignment_asserts': True, 'scalar_asserts': True, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': , 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': , 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []} -(EngineCore pid=244) DEBUG 04-22 00:21:12 [distributed/parallel_state.py:1356] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.26:38023 backend=nccl -(EngineCore pid=244) INFO 04-22 00:21:12 [distributed/parallel_state.py:1400] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.129.9.26:38023 backend=nccl -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) DEBUG 04-22 00:21:12 [distributed/parallel_state.py:1459] Detected 1 nodes in the distributed environment -[Gloo] Rank 0 is connected to 0 peer ranks. 
Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0 -(EngineCore pid=244) INFO 04-22 00:21:12 [distributed/parallel_state.py:1716] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A -(EngineCore pid=244) DEBUG 04-22 00:21:13 [v1/worker/gpu_worker.py:284] worker init memory snapshot: torch_peak=0.0GiB, free_memory=78.68GiB, total_memory=79.19GiB, cuda_memory=0.51GiB, torch_memory=0.0GiB, non_torch_memory=0.51GiB, timestamp=1776817273.1488597, auto_measure=True -(EngineCore pid=244) DEBUG 04-22 00:21:13 [v1/worker/gpu_worker.py:285] worker requested memory: 75.23GiB -(EngineCore pid=244) DEBUG 04-22 00:21:13 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:21:13 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'hidden_states', 'input_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:21:13 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['input_ids', 'positions', 'intermediate_tensors', 'inputs_embeds'] -(EngineCore pid=244) DEBUG 04-22 00:21:13 [compilation/decorators.py:213] Inferred dynamic dimensions for forward method of : ['num_tokens_no_spec', 'token_ids_gpu', 'combined_mask'] -(EngineCore pid=244) DEBUG 04-22 00:21:13 [v1/sample/ops/topk_topp_sampler.py:57] FlashInfer top-p/top-k sampling is available but disabled by default. Set VLLM_USE_FLASHINFER_SAMPLER=1 to opt in after verifying accuracy for your workloads. 
-(EngineCore pid=244) DEBUG 04-22 00:21:13 [v1/sample/logits_processor/__init__.py:65] No logitsprocs plugins installed (group vllm.logits_processors). -(EngineCore pid=244) DEBUG 04-22 00:21:13 [model_executor/offloader/base.py:107] Offloader set to NoopOffloader (no offloading). -(EngineCore pid=244) INFO 04-22 00:21:13 [v1/worker/gpu_model_runner.py:4735] Starting to load model RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8... -(EngineCore pid=244) DEBUG 04-22 00:21:13 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.self_attn.qkv_proj -(EngineCore pid=244) INFO 04-22 00:21:13 [model_executor/.../linear/__init__.py:291] Selected CutlassInt8ScaledMMLinearKernel for CompressedTensorsW8A8Int8 -(EngineCore pid=244) DEBUG 04-22 00:21:13 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:13 [platforms/cuda.py:293] Some attention backends are not valid for cuda with AttentionSelectorConfig(head_size=128, dtype=torch.bfloat16, kv_cache_dtype=auto, block_size=None, use_mla=False, has_sink=False, use_sparse=False, use_mm_prefix=False, use_per_head_quant_scales=False, attn_type=AttentionType.DECODER). Reasons: {}. -(EngineCore pid=244) INFO 04-22 00:21:13 [platforms/cuda.py:334] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION']. 
-(EngineCore pid=244) INFO 04-22 00:21:13 [v1/attention/backends/flash_attn.py:596] Using FlashAttention version 3 -(EngineCore pid=244) DEBUG 04-22 00:21:13 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.0.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.1.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.2.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 
00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.3.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.4.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
model.layers.5.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.5.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.6.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.7.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using 
scheme: CompressedTensorsW8A8Int8 for model.layers.8.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.8.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.9.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.10.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.11.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.12.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
model.layers.13.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.13.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.14.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.15.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] 
Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.16.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.17.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.18.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.19.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.20.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for 
model.layers.21.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.21.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.22.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.23.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] 
Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.24.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.25.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.26.mlp.down_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 
[model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.self_attn.qkv_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.self_attn.o_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.mlp.gate_up_proj -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/.../compressed_tensors/compressed_tensors.py:759] Using scheme: CompressedTensorsW8A8Int8 for model.layers.27.mlp.down_proj -(EngineCore pid=244) :1301: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead. -(EngineCore pid=244) :1301: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead. -(EngineCore pid=244) DEBUG 04-22 00:21:14 [compilation/backends.py:101] Using InductorStandaloneAdaptor -(EngineCore pid=244) DEBUG 04-22 00:21:14 [config/compilation.py:1194] enabled custom ops: Counter() -(EngineCore pid=244) DEBUG 04-22 00:21:14 [config/compilation.py:1195] disabled custom ops: Counter({'rms_norm': 57, 'silu_and_mul': 28, 'vocab_parallel_embedding': 1, 'rotary_embedding': 1, 'apply_rotary_emb': 1, 'parallel_lm_head': 1, 'logits_processor': 1}) -(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/model_loader/base_loader.py:63] Loading weights on cuda ... 
-(EngineCore pid=244) DEBUG 04-22 00:21:14 [model_executor/model_loader/weight_utils.py:557] Using model weights format [['model-00002-of-00002.safetensors', 'model-00001-of-00002.safetensors']] -(EngineCore pid=244) Loading safetensors checkpoint shards: 0% Completed | 0/2 [00:00 -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] Traced files (to be considered for compilation cache): -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/__init__.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/builtins.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/_dynamo/polyfills/itertools.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/torch/nn/modules/container.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/_custom_ops.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/custom_op.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/ScaledMMLinearKernel.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 
[compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/activation.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/attention/attention.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/layernorm.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/linear.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/base.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/rotary_embedding/common.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/interfaces.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/qwen2.py -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:995] /usr/local/lib/python3.12/dist-packages/vllm/platforms/interface.py -(EngineCore pid=244) INFO 04-22 00:21:28 [compilation/backends.py:1051] 
Using cache directory: /data/.cache/vllm/torch_compile_cache/e6437cffcc/rank_0_0/backbone for vLLM's torch.compile -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1063] torch.compile cache factors: env=75f21ac68beac6c5fde9e2342596ca1c49109ca8862043c2fd0e3619eaf3ccd8 cfg=dcef09dcfb comp=e546579c48 code=7251f1a70adc678d269098238fd40a04d778c50abe51e8dae45473a028705e7a dir=/data/.cache/vllm/torch_compile_cache/e6437cffcc/rank_0_0/backbone -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] Compile env factors (raw): -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] {'CMAKE_BUILD_TYPE': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'CUDA_HOME': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'K_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'NVCC_THREADS': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'Q_SCALE_CONSTANT': 200, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_RBLN_RT_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 
[compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VERBOSE': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ALLOW_CHUNKED_LOCAL_ATTN_WITH_HYBRID_KV_CACHE': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ALLOW_INSECURE_SERIALIZATION': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ALLOW_LONG_MAX_MODEL_LEN': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ALLOW_RUNTIME_LORA_UPDATING': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_FLASHINFER': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ALLREDUCE_USE_SYMM_MEM': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_API_KEY': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_BATCH_INVARIANT': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_BLOCKSCALE_FP8_GEMM_FLASHINFER': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_COMPILE_CACHE_SAVE_FORMAT': 'binary', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_COMPUTE_NANS_IN_LOGITS': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_CONFIGURE_LOGGING': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_CONFIG_ROOT': '/data/.config/vllm', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_CPU_NUM_OF_RESERVED_CPU': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_CPU_OMP_THREADS_BIND': 
'auto', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_CPU_SGL_KERNEL': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_CUDART_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_CUSTOM_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DBO_COMM_SMS': 20, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DEBUG_MFU_METRICS': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DEBUG_WORKSPACE': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DEEPEPLL_NVFP4_DISPATCH': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DEEPEP_BUFFER_SIZE_MB': 1024, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DEEP_GEMM_WARMUP': 'relax', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DISABLED_KERNELS': (), -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DISABLE_COMPILE_CACHE': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DISABLE_LOG_LOGO': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DISABLE_PYNCCL': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DISABLE_REQUEST_ID_RANDOMIZATION': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DISABLE_SHARED_EXPERTS_STREAM': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DOCKER_BUILD_CONTEXT': False, 
-(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DP_RANK': 0, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DP_RANK_LOCAL': 0, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_DP_SIZE': 1, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_DRAIN_REQUESTS': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ELASTIC_EP_SCALE_UP_LAUNCH': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ENABLE_CUDAGRAPH_GC': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ENABLE_FLA_PACKED_RECURRENT_DECODE': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_COORDINATE_DESCENT_TUNING': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ENABLE_INDUCTOR_MAX_AUTOTUNE': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ENABLE_MOE_DP_CHUNK': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ENABLE_PREGRAD_PASSES': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ENABLE_RESPONSES_API_STORE': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ENGINE_READY_TIMEOUT_S': 600, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_BACKEND': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB': (), -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_FLASHINFER_MOE_BACKEND': 'latency', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_FLASHINFER_WORKSPACE_BUFFER_SIZE': 413138944, -(EngineCore pid=244) DEBUG 04-22 00:21:28 
[compilation/backends.py:1074] 'VLLM_FLOAT32_MATMUL_PRECISION': 'highest', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_GC_DEBUG': '', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_GPT_OSS_HARMONY_SYSTEM_INSTRUCTIONS': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_GPT_OSS_SYSTEM_TOOL_MCP_LABELS': (), -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_HAS_FLASHINFER_CUBIN': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_KV_CACHE_LAYOUT': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_KV_EVENTS_USE_INT_BLOCK_HASHES': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_LOG_BATCHSIZE_INTERVAL': -1.0, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_LOG_MODEL_INSPECTION': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_LOOPBACK_IP': '', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_LORA_DISABLE_PDL': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_CACHE_DIR': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_LORA_RESOLVER_HF_REPO_LIST': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MAIN_CUDA_VERSION': '12.9', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MARLIN_INPUT_DTYPE': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MARLIN_USE_ATOMIC_ADD': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MAX_N_SEQUENCES': 16384, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE': 163840, -(EngineCore pid=244) DEBUG 04-22 00:21:28 
[compilation/backends.py:1074] 'VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MLA_DISABLE': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MM_HASHER_ALGORITHM': 'blake3', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MOE_DP_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MOE_ROUTING_SIMULATION_STRATEGY': '', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MOE_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MOONCAKE_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MOONCAKE_BOOTSTRAP_PORT': 8998, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MORIIO_CONNECTOR_READ_MODE': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MORIIO_NUM_WORKERS': 1, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MORIIO_POST_BATCH_SIZE': -1, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MORIIO_QP_PER_TRANSFER': 1, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MQ_MAX_CHUNK_BYTES_MB': 16, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MSGPACK_ZERO_COPY_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_MXFP4_USE_MARLIN': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_NCCL_INCLUDE_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_NCCL_SO_PATH': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_NIXL_ABORT_REQUEST_TIMEOUT': 480, -(EngineCore pid=244) DEBUG 04-22 00:21:28 
[compilation/backends.py:1074] 'VLLM_NIXL_EP_MAX_NUM_RANKS': 32, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_HOST': 'localhost', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_NIXL_SIDE_CHANNEL_PORT': 5600, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_NVFP4_GEMM_BACKEND': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_NVTX_SCOPES_FOR_PROFILING': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_PATTERN_MATCH_DEBUG': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_PLUGINS': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_PP_LAYER_PARTITION': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_PROCESS_NAME_PREFIX': 'VLLM', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_RAY_BUNDLE_INDICES': '', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_RAY_DP_PACK_STRATEGY': 'strict', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VARS_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_RAY_EXTRA_ENV_VAR_PREFIXES_TO_COPY': '', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_RAY_PER_WORKER_GPUS': 1.0, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_MFMA_PAGE_ATTN': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_FP8_PADDING': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_MOE_PADDING': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 
[compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_QUICK_REDUCE_QUANTIZATION': 'NONE', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_SLEEP_MEM_CHUNK_SIZE': 256, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4BMM': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP4_ASM_GEMM': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FP8BMM': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_FUSION_SHARED_EXPERTS': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_LINEAR': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MHA': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MLA': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_MOE': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_PAGED_ATTN': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_RMSNORM': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_AITER_TRITON_ROPE': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 
'VLLM_ROCM_USE_AITER_UNIFIED_ATTENTION': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_ROCM_USE_SKINNY_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_RPC_TIMEOUT': 10000, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD': 256, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_SKIP_P2P_CHECK': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_SPARSE_INDEXER_MAX_LOGITS_MB': 512, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_SYSTEM_START_DATE': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_TARGET_DEVICE': 'cuda', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_TEST_FORCE_FP8_MARLIN': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_TOOL_JSON_ERROR_AUTOMATIC_RETRY': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS': 1, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_TPU_BUCKET_PADDING_GAP': 0, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_TPU_MOST_MODEL_LEN': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_TPU_USING_PATHWAYS': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_TRACE_FUNCTION': 0, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USAGE_SOURCE': 'production-docker-image', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_AOT_COMPILE': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 
[compilation/backends.py:1074] 'VLLM_USE_BYTECODE_HOOK': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_E8M0': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_DEEP_GEMM_TMA_ALIGNED_SCALES': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_EXPERIMENTAL_PARSER_CONTEXT': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FBGEMM': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP16': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP4': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_FP8': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_INT4': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_BF16': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_MOE_MXFP4_MXFP8_CUTLASS': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FLASHINFER_SAMPLER': None, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_FUSED_MOE_GROUPED_TOPK': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_MEGA_AOT_ARTIFACT': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_NCCL_SYMM_MEM': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_NVFP4_CT_EMULATIONS': False, 
-(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_OINK_OPS': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_PRECOMPILED': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_CHANNEL_TYPE': 'auto', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_RAY_COMPILED_DAG_OVERLAP_COMM': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_RAY_WRAPPED_PP_COMM': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_SIMPLE_KV_OFFLOAD': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_STANDALONE_COMPILE': True, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_TRITON_AWQ': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_USE_V2_MODEL_RUNNER': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_V1_USE_OUTLINES_CACHE': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_PIN_MEMORY': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_WEIGHT_OFFLOADING_DISABLE_UVA': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_XGRAMMAR_CACHE_MB': 512, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_XLA_CACHE_PATH': '/data/.cache/vllm/xla_cache', -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_XLA_CHECK_RECOMPILATION': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_XLA_USE_SPMD': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 'VLLM_XPU_ENABLE_XPU_GRAPH': False, -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] 
'V_SCALE_CONSTANT': 100} -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/backends.py:1074] Vllm config hash: dcef09dcfb -(EngineCore pid=244) INFO 04-22 00:21:28 [compilation/backends.py:1111] Dynamo bytecode transform time: 4.90 s -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/piecewise_backend.py:151] PiecewiseBackend: compile_ranges: [(1, 8192)] -(EngineCore pid=244) DEBUG 04-22 00:21:28 [compilation/piecewise_backend.py:155] PiecewiseBackend: compile_sizes: [] -(EngineCore pid=244) DEBUG 04-22 00:21:29 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:21:29 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.3 ms -(EngineCore pid=244) DEBUG 04-22 00:21:29 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 0.9 ms -(EngineCore pid=244) DEBUG 04-22 00:21:29 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:21:29 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(EngineCore pid=244) INFO 04-22 00:21:30 [compilation/backends.py:372] Cache the graph of compile range (1, 8192) for later use -(EngineCore pid=244) DEBUG 04-22 00:21:30 [compilation/backends.py:377] Store the 0-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_0', '/data/.cache/vllm/torch_compile_cache/e6437cffcc/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_0') -(EngineCore pid=244) DEBUG 04-22 00:21:31 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:21:31 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 0.5 ms -(EngineCore pid=244) DEBUG 04-22 00:21:31 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.5 ms -(EngineCore pid=244) DEBUG 
04-22 00:21:31 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:21:31 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.2 ms -(APIServer pid=1) DEBUG 04-22 00:21:32 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. -(EngineCore pid=244) DEBUG 04-22 00:21:33 [compilation/backends.py:377] Store the 1-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_1', '/data/.cache/vllm/torch_compile_cache/e6437cffcc/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_1') -(EngineCore pid=244) DEBUG 04-22 00:21:34 [compilation/.../utility/noop_elimination.py:105] Removed 0 no-op reshapes and slices -(EngineCore pid=244) DEBUG 04-22 00:21:34 [compilation/passes/vllm_inductor_pass.py:79] NoOpEliminationPass completed in 1.1 ms -(EngineCore pid=244) DEBUG 04-22 00:21:34 [compilation/passes/vllm_inductor_pass.py:79] PostCleanupPass completed in 1.1 ms -(EngineCore pid=244) DEBUG 04-22 00:21:34 [compilation/.../utility/fix_functionalization.py:203] De-functionalized 0 nodes, removed 0 nodes -(EngineCore pid=244) DEBUG 04-22 00:21:34 [compilation/passes/vllm_inductor_pass.py:79] FixFunctionalizationPass completed in 0.5 ms -(EngineCore pid=244) DEBUG 04-22 00:21:34 [compilation/backends.py:377] Store the 28-th graph for compile range(1, 8192) from inductor_standalone via handle ('artifact_compile_range_1_8192_subgraph_28', '/data/.cache/vllm/torch_compile_cache/e6437cffcc/rank_0_0/backbone/artifact_compile_range_1_8192_subgraph_28') -(EngineCore pid=244) INFO 04-22 00:21:34 [compilation/backends.py:390] Compiling a graph for compile range (1, 8192) takes 6.31 s -(EngineCore pid=244) DEBUG 04-22 00:21:34 [compilation/backends.py:1228] Computation graph saved to /data/.cache/vllm/torch_compile_cache/e6437cffcc/rank_0_0/backbone/computation_graph.py -(EngineCore 
pid=244) INFO 04-22 00:21:35 [compilation/decorators.py:640] saved AOT compiled function to /data/.cache/vllm/torch_compile_cache/torch_aot_compile/2034ef402a31dd9ff23fe79a0a9c284842df5a9a5b69f98bb3d96953f0298379/rank_0_0/model -(EngineCore pid=244) INFO 04-22 00:21:35 [compilation/monitor.py:48] torch.compile took 12.49 s in total -(EngineCore pid=244) INFO 04-22 00:21:36 [compilation/monitor.py:76] Initial profiling/warmup run took 0.31 s -(EngineCore pid=244) INFO 04-22 00:21:41 [v1/core/kv_cache_utils.py:829] Overriding num_gpu_blocks=0 with num_gpu_blocks_override=512 -(EngineCore pid=244) DEBUG 04-22 00:21:41 [v1/worker/gpu_model_runner.py:5818] Initialized minimal KV cache for CUDA graph profiling -(EngineCore pid=244) INFO 04-22 00:21:41 [v1/worker/gpu_model_runner.py:5876] Profiling CUDA graph memory: PIECEWISE=51 (largest=512), FULL=51 (largest=512) -(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:21:42 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=512, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:21:42 [compilation/cuda_graph.py:271] Capturing a cudagraph on (PIECEWISE,BatchDescriptor(num_tokens=496, num_reqs=None, uniform=False, has_lora=False, num_active_loras=0)) -(APIServer pid=1) DEBUG 04-22 00:21:42 [v1/engine/utils.py:1047] Waiting for 1 local, 0 remote core engine proc(s) to start. 
-(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5928] Estimated PIECEWISE CUDA graph memory: 128.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:21:42 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=512, num_reqs=512, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) DEBUG 04-22 00:21:42 [compilation/cuda_graph.py:271] Capturing a cudagraph on (FULL,BatchDescriptor(num_tokens=496, num_reqs=496, uniform=True, has_lora=False, num_active_loras=0)) -(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5928] Estimated FULL CUDA graph memory: 228.00 MiB first-capture + (51-1) × 6.00 MiB per-graph -(EngineCore pid=244) DEBUG 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5859] Cleaned up profiling KV cache and CUDA graphs -(EngineCore pid=244) INFO 04-22 00:21:42 [v1/worker/gpu_model_runner.py:5955] Estimated CUDA graph memory: 0.81 GiB total -(EngineCore pid=244) DEBUG 04-22 00:21:43 [v1/worker/gpu_worker.py:424] Initial free memory: 78.68 GiB; Requested memory: 0.950000 (util), 75.23 GiB -(EngineCore pid=244) DEBUG 04-22 00:21:43 [v1/worker/gpu_worker.py:430] Free memory after profiling: 69.79 GiB (total), 66.34 GiB (within requested) -(EngineCore pid=244) DEBUG 04-22 00:21:43 [v1/worker/gpu_worker.py:435] Memory profiling takes 19.77 seconds. 
Total non KV cache memory: 10.59GiB; torch peak memory increase: 2.21GiB; non-torch forward increase memory: 0.25GiB; weights memory: 8.14GiB. -(EngineCore pid=244) INFO 04-22 00:21:43 [v1/worker/gpu_worker.py:436] Available KV cache memory: 64.64 GiB -(EngineCore pid=244) INFO 04-22 00:21:43 [v1/worker/gpu_worker.py:470] In v0.19, CUDA graph memory profiling will be enabled by default (VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1), which more accurately accounts for CUDA graph memory during KV cache allocation. To try it now, set VLLM_MEMORY_PROFILER_ESTIMATE_CUDAGRAPHS=1 and increase --gpu-memory-utilization from 0.9500 to 0.9602 to maintain the same effective KV cache size. -(EngineCore pid=244) INFO 04-22 00:21:43 [v1/core/kv_cache_utils.py:1319] GPU KV cache size: 1,210,272 tokens -(EngineCore pid=244) INFO 04-22 00:21:43 [v1/core/kv_cache_utils.py:1324] Maximum concurrency for 8,192 tokens per request: 147.74x -(EngineCore pid=244) 2026-04-22 00:21:43,131 - INFO - autotuner.py:262 - flashinfer.jit: [Autotuner]: Autotuning process starts ... 
-(EngineCore pid=244) DEBUG 04-22 00:21:43 [v1/worker/gpu_model_runner.py:5342] ubatch_slices: None, ubatch_slices_padded: None -(EngineCore pid=244) 2026-04-22 00:21:43,141 - INFO - autotuner.py:268 - flashinfer.jit: [Autotuner]: Autotuning process ends -(EngineCore pid=244) Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 0%| | 0/51 [00:00 Date: Fri, 24 Apr 2026 14:38:35 -0400 Subject: [PATCH 15/24] Update readme instructions: Signed-off-by: Jing Chen --- accuracy/.gitignore | 1 + accuracy/README.md | 46 ++++++++++++++++++++++----------------------- 2 files changed, 24 insertions(+), 23 deletions(-) create mode 100644 accuracy/.gitignore diff --git a/accuracy/.gitignore b/accuracy/.gitignore new file mode 100644 index 00000000..fbca2253 --- /dev/null +++ b/accuracy/.gitignore @@ -0,0 +1 @@ +results/ diff --git a/accuracy/README.md b/accuracy/README.md index e6aaaf3c..9615d44b 100644 --- a/accuracy/README.md +++ b/accuracy/README.md @@ -113,24 +113,22 @@ on resubmit. Pull logs and JSON results from the cluster PVC to your local machine: ```bash -python accuracy/scripts/collect.py -# Results land in: data/benchmarks/memory/v0.19.0/runs/ and .../logs/ +uv run python accuracy/scripts/collect.py --out accuracy/results/ +# Results land in: accuracy/results/v0.19.0/runs/ and .../logs/ ``` -Copy the new run JSONs into `accuracy/results/v0.19.0/runs/` (this directory is gitignored), then generate the report. -`analyze.py` calls the capacity planner directly to compute predictions — no separate -calibration step needed. 
For gated models pass `--hf-token ` (only fetches -`config.json`, not model weights): +Then run the three-step pipeline to generate the report: ```bash -python accuracy/scripts/analyze.py \ - --runs accuracy/results/v0.19.0/runs/ \ - --out accuracy/results/v0.19.0/report.md \ - --csv accuracy/results/v0.19.0/results.csv - -python accuracy/scripts/deep_analysis.py \ - --csv accuracy/results/v0.19.0/results.csv \ - --out accuracy/results/v0.19.0/deep_analysis.md +# Step 1: parse vLLM startup logs → results_raw.csv +uv run python accuracy/scripts/parse_logs.py + +# Step 2: run capacity planner predictions → results_predicted.csv +# Gated models (google/gemma-*) require an HF token that has been granted access: +HF_TOKEN=hf_YOUR_TOKEN_HERE uv run python accuracy/scripts/predict_capacity.py + +# Step 3: compute error statistics → accuracy/results/v0.19.0/accuracy_report.md +uv run python accuracy/scripts/analyze.py ``` ## Reproducing from existing results (no cluster needed) @@ -140,17 +138,19 @@ Download the `results/` folder from Google Drive and place it at `accuracy/resul **[Download results/ from Google Drive](https://drive.google.com/drive/folders/1a0y2gdhcpKcFxm4RsqXUKWW40Gpd2Kx5?usp=sharing)** -Once downloaded, regenerate the report and analysis locally: +Once downloaded, place the `results/` folder at `accuracy/results/` and regenerate the +report locally (no cluster or HF token needed — the Drive folder includes pre-computed CSVs): ```bash -uv run python accuracy/scripts/analyze.py \ - --runs accuracy/results/v0.19.0/runs/ \ - --out accuracy/results/v0.19.0/report.md \ - --csv accuracy/results/v0.19.0/results.csv - -uv run python accuracy/scripts/deep_analysis.py \ - --csv accuracy/results/v0.19.0/results.csv \ - --out accuracy/results/v0.19.0/deep_analysis.md +# Re-parse logs into results_raw.csv (optional — Drive copy is already up to date) +uv run python accuracy/scripts/parse_logs.py + +# Regenerate predictions (optional — requires HF_TOKEN for gated Gemma 
models) +HF_TOKEN=hf_YOUR_TOKEN_HERE uv run python accuracy/scripts/predict_capacity.py + +# Re-generate the accuracy report from the CSVs +uv run python accuracy/scripts/analyze.py +# Output: accuracy/results/v0.19.0/accuracy_report.md ``` ## Troubleshooting From f0197dd6d3b4a1b8965c12526a4a867af0edfa98 Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Sun, 26 Apr 2026 13:26:15 -0400 Subject: [PATCH 16/24] Address comments Signed-off-by: Jing Chen --- accuracy/blog-gpu-capacity.md | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/accuracy/blog-gpu-capacity.md b/accuracy/blog-gpu-capacity.md index 984b070c..dd5a15c3 100644 --- a/accuracy/blog-gpu-capacity.md +++ b/accuracy/blog-gpu-capacity.md @@ -32,7 +32,7 @@ Claiming a tool is accurate is easy. Measuring it is harder. We launched vLLM se - **Dtype and quantization variants**: bfloat16, float16; compressed-tensors, GPTQ - **vLLM version sensitivity**: Qwen3-14B across v0.15.0 through v0.19.0 to track how memory behavior changes across releases -For each run, we compared predicted values against the four measured memory components independently. The raw results and all run JSON files are committed to the repository, and the analysis is fully reproducible locally without cluster access. +For each run, we compared predicted values against the four measured memory components independently. The [raw results and all run JSON files](https://github.com/llm-d-incubation/llm-d-planner/tree/main/accuracy/results) are committed to the repository, and the analysis is fully reproducible locally without cluster access. --- @@ -46,7 +46,7 @@ Weight prediction is harder than it looks: dense, MoE, multi-head latent attenti **KV cache memory: 0.34% mean error** across all runs. This is the component that matters most for capacity planning, as it determines your maximum concurrent token budget. 
For Llama-3.1-8B at TP=1 with 8K context, that's roughly 58 GiB of KV pool, and we're within half a GiB across every context length we tested. -One insight worth pausing on: the KV pool size is *independent* of `max_model_len`. vLLM sizes the pool from whatever memory remains after weights and activation are allocated, then figures out how many tokens fit given the per-token KV size for that architecture. This means setting a longer context window doesn't shrink your KV pool; it just means each token occupies more of it. Tools that pre-allocate based on `max_model_len` will over-estimate memory for long-context configs and leave capacity on the table. +One insight worth pausing on: the KV pool size is *independent* of `max_model_len`. vLLM sizes the pool from whatever memory remains after weights and activation are allocated, then figures out how many tokens fit given the per-token KV size for that architecture. This means setting a longer context window doesn't shrink your KV pool; it just means each request can use a larger share of it, leaving less headroom for concurrent requests at the maximum context length. Tools that pre-allocate based on `max_model_len` will over-estimate memory for long-context configs and leave capacity on the table. These two components together typically account for 90%+ of total GPU memory consumption. Getting them right is what makes the planner useful in practice. @@ -70,9 +70,11 @@ The planner's Qwen3 activation constant was 5.60 GiB, a near-exact match for v0. This kind of silent drift is precisely why empirical validation matters. We didn't catch it until we ran the experiments, and the fix was straightforward once we knew where to look: re-calibrate every architecture constant against v0.19.0 measurements. That's now done, and the updated constants are in the library. +The planner currently tracks the behavior of the latest supported vLLM release (v0.19.0). 
It is not version-aware: it won't automatically adjust for older releases. When vLLM changes memory behavior in a future release, re-running the accuracy sweep and submitting a PR with updated constants is how the library stays current—which is exactly the kind of contribution the community campaign is designed to support. + **Non-torch overhead** (CUDA runtime + NCCL buffers) was under-estimated by 44% on average. At TP=1, this is a small absolute amount (~0.25 GiB actual vs 0.15 GiB predicted). At TP>=2, NCCL all-reduce buffers push actual overhead to ~2.1 GiB per GPU versus our constant of 0.60 GiB, a more meaningful gap. Updated multi-GPU constants are also in. -There are a few configurations the experiment didn't cover that the planner doesn't yet model: fp8 KV cache dtype (halves per-token storage, roughly doubling token capacity), float32 dtype overrides (doubles weight memory), runtime fp8 quantization, and data parallelism. These are real gaps for anyone running quantized production models today, and they're actively being worked on — contributions are welcome if you need one of these sooner. The sweep also turned up a subtle correctness bug in `find_possible_tp`: it wasn't verifying that TP values divide `vocab_size`, which can cause vLLM to reject a configuration the planner suggests as valid. That's fixed. +There are a few configurations the experiment didn't cover that the planner doesn't yet model: fp8 KV cache dtype (halves per-token storage, roughly doubling token capacity), float32 dtype overrides (doubles weight memory), runtime fp8 quantization, and data parallelism. For entirely unknown precision types, the planner raises an error. For the gaps listed above, however, it still produces an estimate from the base model configuration without accounting for the override, so the affected component may be off and no warning is emitted.
These are real gaps for anyone running quantized production models today, and they're actively being worked on — contributions are welcome if you need one of these sooner. The sweep also turned up a subtle correctness bug in `find_possible_tp`: it wasn't verifying that TP values divide `vocab_size`, which can cause vLLM to reject a configuration the planner suggests as valid. That's fixed. --- From 875a4fd609eef0d70810ac27174ca9b42156d13a Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Tue, 28 Apr 2026 19:03:22 -0400 Subject: [PATCH 17/24] update with mimo results Signed-off-by: Jing Chen --- accuracy/accuracy_report.md | 21 +++--- accuracy/blog-gpu-capacity.md | 16 ++--- .../scripts/sweep-new-models-2026-04.yaml | 65 +++++++++++++++++++ accuracy/scripts/sweep.yaml | 4 ++ 4 files changed, 89 insertions(+), 17 deletions(-) create mode 100644 accuracy/scripts/sweep-new-models-2026-04.yaml diff --git a/accuracy/accuracy_report.md b/accuracy/accuracy_report.md index 2e7e0af0..49e494d8 100644 --- a/accuracy/accuracy_report.md +++ b/accuracy/accuracy_report.md @@ -10,18 +10,18 @@ ## Part 1: Accuracy Evaluation — vLLM v0.19.0 -Covers 53 runs across 34 models using only parameters the planner currently accepts as inputs. Excludes runs with `--dtype float32`, runtime `--quantization fp8`, and `--kv-cache-dtype fp8` (see Part 3). +Covers 54 runs across 35 models using only parameters the planner currently accepts as inputs. Excludes runs with `--dtype float32`, runtime `--quantization fp8`, and `--kv-cache-dtype fp8` (see Part 3). 
### Summary | Metric | Mean error | Mean abs error | n | |--------|:----------:|:--------------:|:-:| -| KV cache memory (all runs) | +0.34% | +6.62% | 52 | -| KV cache memory (baseline: tp=pp=dp=1, len=8192, no quant) | -5.12% | — | 21 | -| Weight memory | -0.89% | +0.89% | 52 | -| Activation memory | +195.12% | +195.12% | 52 | -| Non-torch overhead | -44.08% | — | 52 | -| Max concurrency | +3.34% | +15.34% | 52 | +| KV cache memory (all runs) | +0.34% | +6.62% | 53 | +| KV cache memory (baseline: tp=pp=dp=1, len=8192, no quant) | -5.12% | — | 22 | +| Weight memory | -0.89% | +0.89% | 53 | +| Activation memory | +195.12% | +195.12% | 53 | +| Non-torch overhead | -44.08% | — | 53 | +| Max concurrency | +3.34% | +15.34% | 53 | **Key findings**: @@ -55,6 +55,7 @@ Covers 53 runs across 34 models using only parameters the planner currently acce | Mistral-Small-3.1-24B-Instruct-2503 | Mistral3* | -0.08% | +23.15% | -40.00% | +1.54% | +1.55% | | Qwen1.5-MoE-A2.7B | Qwen2Moe | -0.02% | +223.89% | -40.00% | -10.14% | -10.12% | | Kimi-VL-A3B-Instruct | KimiVL* | -0.58% | +173.97% | -40.00% | -9.76% | -87.31% | +| MiMo-VL-7B-SFT | Qwen2_5_VL* | -0.82% | +120.88% | -40.00% | -3.54% | -3.54% | ### Sensitivity: Tensor Parallelism (TP) @@ -126,6 +127,7 @@ The planner uses fixed constants per architecture calibrated against vLLM v0.16. | Phi | 5.50 | 0.79 | +596.20% | | Qwen2 | 5.60 | 2.21–2.29 | +144.54% to +153.39% | | Qwen3 | 5.60 | 2.21 | +153.39% | +| Qwen2_5_VL* | 5.50 | 2.49 | +120.88% | | Qwen2Moe | 8.00 | 2.47 | +223.89% | | Qwen3Moe | 8.00 | 2.68 | +198.51% | @@ -243,11 +245,11 @@ Runtime `--quantization fp8` compresses weights on-the-fly. vLLM logs the post-c ## Run Matrix — vLLM v0.19.0 / H100-80GB -**57 successful runs, 7 failed runs.** +**58 successful runs, 7 failed runs.** Quantization abbreviations: `ct` = compressed-tensors, `gptq` = gptq_marlin, `fp8` = fp8 inline, `mxfp4` = mx-format fp4, `—` = none. 
-Vision/multi-modal models in this sweep: `moonshotai/Kimi-VL-A3B-Instruct` (vision-language MoE), `ibm-granite/granite-vision-3.3-2b` (vision-language), `google/gemma-3-{4b,12b,27b}-it` (vision-language), `meta-llama/Llama-4-Scout-17B-16E-Instruct` (vision+text MoE). +Vision/multi-modal models in this sweep: `moonshotai/Kimi-VL-A3B-Instruct` (vision-language MoE), `ibm-granite/granite-vision-3.3-2b` (vision-language), `google/gemma-3-{4b,12b,27b}-it` (vision-language), `meta-llama/Llama-4-Scout-17B-16E-Instruct` (vision+text MoE), `XiaomiMiMo/MiMo-VL-7B-SFT` (vision-language). `Qwen/Qwen1.5-MoE-A2.7B` uses the Qwen2Moe architecture (14.3B total, 2.7B active). Its activation memory (2.47 GiB) is much lower than the generic MoE constant (8.0 GiB) used by the planner, similar to the pattern observed for Qwen3Moe, Mixtral, and Llama4. @@ -312,6 +314,7 @@ Vision/multi-modal models in this sweep: `moonshotai/Kimi-VL-A3B-Instruct` (visi | RedHatAI/Mistral-Small-3.1-24B-Instruct-2503-quantized.w8a8 | 2 | 1 | 1 | 8192 | bf16 | ct | auto | -0.8% | +23.2% | -71.0% | +5.3% | | RedHatAI/Qwen2.5-7B-Instruct-fp8-dynamic | 1 | 1 | 1 | 8192 | bf16 | ct | auto | -0.4% | +153.4% | -37.5% | -3.9% | | RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 | 1 | 1 | 1 | 8192 | bf16 | ct | auto | -0.4% | +153.4% | -40.0% | -3.9% | +| XiaomiMiMo/MiMo-VL-7B-SFT | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.8% | +120.9% | -40.0% | -3.5% | ### Failed Runs diff --git a/accuracy/blog-gpu-capacity.md b/accuracy/blog-gpu-capacity.md index dd5a15c3..8fd2b705 100644 --- a/accuracy/blog-gpu-capacity.md +++ b/accuracy/blog-gpu-capacity.md @@ -1,6 +1,6 @@ -# 57 Experiments Later: What We Learned About LLM Memory Prediction +# 58 Experiments Later: What We Learned About LLM Memory Prediction -*GPU memory planning for LLM deployments is still mostly guesswork. Here's what we learned from measuring it empirically across 34 architectures.* +*GPU memory planning for LLM deployments is still mostly guesswork. 
Here's what we learned from measuring it empirically across 35 architectures.* --- @@ -10,7 +10,7 @@ In all of these cases, the question is the same: **how much GPU memory will this Most teams answer it by copying what someone else deployed, or by spinning up the pod, watching it OOM, and doubling the resources. This works, but it gets harder as models grow larger and serving configurations more complex. Tensor parallelism, pipeline parallelism, quantization, and long-context windows all change the memory footprint in non-obvious ways. -[llm-d-planner](https://github.com/llm-d-incubation/llm-d-planner) is an open-source library that guides LLM deployments from concept to production. One of its core submodules is a capacity planner built to answer this question before you deploy. To make sure it wasn't just replacing guesswork with a false sense of precision, we ran 57 experiments on H100 GPUs to validate its predictions against reality. Here's what we found, and why we're asking the community to help make it even better. +[llm-d-planner](https://github.com/llm-d-incubation/llm-d-planner) is an open-source library that guides LLM deployments from concept to production. One of its core submodules is a capacity planner built to answer this question before you deploy. To make sure it wasn't just replacing guesswork with a false sense of precision, we ran 58 experiments on H100 GPUs to validate its predictions against reality. Here's what we found, and why we're asking the community to help make it even better. --- @@ -24,9 +24,9 @@ It breaks memory into four components: weights, KV cache, activation memory, and ## The Experiment: Trusting but Verifying -Claiming a tool is accurate is easy. Measuring it is harder. We launched vLLM servers across 57 configurations on H100-80GB GPUs, captured the full startup logs for each, and parsed the actual memory measurements reported by vLLM at initialization. 
We then compared those measurements against llm-d-planner's predictions for every configuration. The sweep covered: +Claiming a tool is accurate is easy. Measuring it is harder. We launched vLLM servers across 58 configurations on H100-80GB GPUs, captured the full startup logs for each, and parsed the actual memory measurements reported by vLLM at initialization. We then compared those measurements against llm-d-planner's predictions for every configuration. The sweep covered: -- **34 model architectures**: Llama, Qwen, Gemma, Granite, Mistral, DeepSeek, Phi, Mixtral, and multimodal models including LLaVA and Kimi-VL +- **35 model architectures**: Llama, Qwen, Gemma, Granite, Mistral, DeepSeek, Phi, Mixtral, and multimodal models including LLaVA, Kimi-VL, and MiMo - **Tensor parallelism** (TP 1, 2, 4) and **pipeline parallelism** (PP 1, 2, 4) - **Context lengths** from 2,048 to 32,768 tokens - **Dtype and quantization variants**: bfloat16, float16; compressed-tensors, GPTQ @@ -40,9 +40,9 @@ For each run, we compared predicted values against the four measured memory comp ### The headline: accurate where it counts most -**Weight memory: 0.89% mean absolute error** across 53 of the 57 runs. (The remaining 4 used parameters the planner doesn't yet model, float32 dtype and runtime fp8 quantization, and are discussed below.) This is the single largest memory component; for a model like Llama-3.1-8B at TP=1, weights consume about 15 GiB of the 79 GiB available. It's also the hardest to get right across a diverse model set. +**Weight memory: 0.89% mean absolute error** across 54 of the 58 runs. (The remaining 4 used parameters the planner doesn't yet model, float32 dtype and runtime fp8 quantization, and are discussed below.) This is the single largest memory component; for a model like Llama-3.1-8B at TP=1, weights consume about 15 GiB of the 79 GiB available. It's also the hardest to get right across a diverse model set. 
-Weight prediction is harder than it looks: dense, MoE, multi-head latent attention, and vision-language models all organize parameters differently, quantization changes the bytes-per-parameter, and TP sharding depends on how dimensions divide across ranks. The formula handles all of this by reading `config.json` for architecture parameters and safetensor headers for exact tensor shapes, giving precise counts without downloading the full model and making it generalizable to any model on HuggingFace beyond the 34 we explicitly tested. Across dense, MoE, multimodal, and quantized architectures, it held to under 1% error. +Weight prediction is harder than it looks: dense, MoE, multi-head latent attention, and vision-language models all organize parameters differently, quantization changes the bytes-per-parameter, and TP sharding depends on how dimensions divide across ranks. The formula handles all of this by reading `config.json` for architecture parameters and safetensor headers for exact tensor shapes, giving precise counts without downloading the full model and making it generalizable to any model on HuggingFace beyond the 35 we explicitly tested. Across dense, MoE, multimodal, and quantized architectures, it held to under 1% error. **KV cache memory: 0.34% mean error** across all runs. This is the component that matters most for capacity planning, as it determines your maximum concurrent token budget. For Llama-3.1-8B at TP=1 with 8K context, that's roughly 58 GiB of KV pool, and we're within half a GiB across every context length we tested. @@ -80,7 +80,7 @@ There are a few configurations the experiment didn't cover that the planner does ## Join the Community -We covered 34 architectures. The LLM landscape releases more every week, and vLLM will keep evolving. Accuracy at a point in time isn't enough; what matters is having a community that keeps the constants current as things change. +We covered 35 architectures. 
The LLM landscape releases more every week, and vLLM will keep evolving. Accuracy at a point in time isn't enough; what matters is having a community that keeps the constants current as things change. **If your model isn't covered, or a new architecture ships with memory optimizations** (a new attention variant, a custom KV cache layout, or a novel quantization scheme), llm-d-planner should be where those updated constants land first. The sweep runner in `accuracy/` is fully documented and self-contained; run it against your own cluster, submit the results as a PR, and everyone who installs the library gets the improvement. diff --git a/accuracy/scripts/sweep-new-models-2026-04.yaml b/accuracy/scripts/sweep-new-models-2026-04.yaml new file mode 100644 index 00000000..f7614d6d --- /dev/null +++ b/accuracy/scripts/sweep-new-models-2026-04.yaml @@ -0,0 +1,65 @@ +# New model sweep — April 2026 / vLLM v0.19.0 / H100-80GB +# 8 models: zai-org (GLM-5, GLM-5-FP8, GLM-OCR, GLM-4.5), XiaomiMiMo (MiMo-V2-Flash, MiMo-VL-7B-SFT, MiMo-V2.5), moonshotai (Kimi-K2.5) +# All large MoE models use fp8 inline quantization; gmu=0.85 where HF recommends it. +# Kimi-K2.5 (tp=8, 1T MoE): fp8 still ~125 GiB/GPU — OOM expected, pending resource availability. +# GLM-5/FP8 (tp=8): fp8 ~93 GiB/GPU — may still OOM; startup logs will tell. +# +# Edit defaults.node_selector to match your cluster's GPU node label. 
+ +defaults: + gpu: H100-80GB + gpu_memory_utilization: "0.95" + max_model_len: 8192 + pp: 1 + dp: 1 + dtype: auto + kv_cache_dtype: auto + quantization: null + vllm_image: vllm/vllm-openai:v0.19.0 + namespace: llmdplanner + results_pvc: vllm-mem-data + node_selector: + nvidia.com/gpu.product: NVIDIA-H100-80GB-HBM3 + +runs: + # ── ZhipuAI / zai-org models ────────────────────────────────────────────── + # tp=8 — skipped: cluster fully allocated, no 8-GPU node available + # - model: zai-org/GLM-5 # 744B total, 40B active MoE; fp8 → ~93 GiB/GPU @ tp=8; may still OOM (weights > 80 GiB) + # tp: 8 + # gpu_memory_utilization: "0.85" + # quantization: fp8 + # trust_remote_code: true + + # - model: zai-org/GLM-5-FP8 # 754B total, 40B active MoE; pre-quantized FP8 (auto-detected) + # tp: 8 + # gpu_memory_utilization: "0.85" + # trust_remote_code: true + + - model: zai-org/GLM-OCR # 0.9B dense vision-language OCR; CogViT + GLM-0.5B decoder + tp: 1 + trust_remote_code: true + + # - model: zai-org/GLM-4.5 # 355B total, 32B active MoE; fp8 → ~44 GiB/GPU @ tp=8; should fit + # tp: 8 + # quantization: fp8 + # trust_remote_code: true + + # ── XiaomiMiMo models ───────────────────────────────────────────────────── + - model: XiaomiMiMo/MiMo-V2-Flash # 309B total, 15B active MoE; fp8 → ~77 GiB/GPU @ tp=4; minimal KV cache expected + tp: 4 + quantization: fp8 + trust_remote_code: true + + - model: XiaomiMiMo/MiMo-VL-7B-SFT # 7B dense vision-language; tp=1, no quantization needed + tp: 1 + trust_remote_code: true + + - model: XiaomiMiMo/MiMo-V2.5 # 310B total, 15B active MoE; fp8 → ~77 GiB/GPU @ tp=4; minimal KV cache expected + tp: 4 + quantization: fp8 + trust_remote_code: true + + # ── moonshotai ──────────────────────────────────────────────────────────── + # - model: moonshotai/Kimi-K2.5 # 1T total, 32B active MoE; fp8 still ~125 GiB/GPU @ tp=8; OOM expected (pending) + # tp: 8 + # trust_remote_code: true diff --git a/accuracy/scripts/sweep.yaml b/accuracy/scripts/sweep.yaml 
index 64570531..2d25af75 100644 --- a/accuracy/scripts/sweep.yaml +++ b/accuracy/scripts/sweep.yaml @@ -232,3 +232,7 @@ runs: - model: RedHatAI/Qwen2.5-7B-Instruct-quantized.w8a8 # w8a8 weights; compressed-tensors tp: 1 _label: w8a8-redhatai-qwen2-5-7b + + - model: XiaomiMiMo/MiMo-VL-7B-SFT # 7B dense vision-language; tp=1 + tp: 1 + trust_remote_code: true From 3ecd91c9ae874de16fdee3f2a3320dd4afdc75cf Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Wed, 29 Apr 2026 12:06:34 -0400 Subject: [PATCH 18/24] Refinements Signed-off-by: Jing Chen --- accuracy/blog-gpu-capacity.md | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/accuracy/blog-gpu-capacity.md b/accuracy/blog-gpu-capacity.md index 8fd2b705..2fdabcaa 100644 --- a/accuracy/blog-gpu-capacity.md +++ b/accuracy/blog-gpu-capacity.md @@ -1,4 +1,4 @@ -# 58 Experiments Later: What We Learned About LLM Memory Prediction +# 60 Experiments Later: What We Learned About LLM Memory Prediction *GPU memory planning for LLM deployments is still mostly guesswork. Here's what we learned from measuring it empirically across 35 architectures.* @@ -10,7 +10,7 @@ In all of these cases, the question is the same: **how much GPU memory will this Most teams answer it by copying what someone else deployed, or by spinning up the pod, watching it OOM, and doubling the resources. This works, but it gets harder as models grow larger and serving configurations more complex. Tensor parallelism, pipeline parallelism, quantization, and long-context windows all change the memory footprint in non-obvious ways. -[llm-d-planner](https://github.com/llm-d-incubation/llm-d-planner) is an open-source library that guides LLM deployments from concept to production. One of its core submodules is a capacity planner built to answer this question before you deploy. To make sure it wasn't just replacing guesswork with a false sense of precision, we ran 58 experiments on H100 GPUs to validate its predictions against reality. 
Here's what we found, and why we're asking the community to help make it even better. +[llm-d-planner](https://github.com/llm-d-incubation/llm-d-planner) is an open-source library that guides LLM deployments from concept to production. One of its core submodules is a capacity planner built to answer this question before you deploy. To make sure it wasn't just replacing guesswork with a false sense of precision, we ran 60 experiments on H100 GPUs to validate its predictions against reality. Here's what we found, and why we're asking the community to help make it even better. --- @@ -20,11 +20,13 @@ llm-d-planner guides LLM deployments from concept to production: conversational It breaks memory into four components: weights, KV cache, activation memory, and non-torch overhead (CUDA runtime and NCCL buffers for multi-GPU). Each scales differently with tensor parallelism, context length, and quantization, so knowing which component is driving your footprint tells you what to actually change. For each component, the planner anchors to a source of truth wherever one exists: `config.json` and safetensor file headers for weights, vLLM's allocation strategy for KV cache, and empirically measured constants for things that can't be derived analytically, like activation memory. The experiment in this post is how those constants are kept honest. +Accurate memory prediction is also the prerequisite for any infrastructure decision: choosing a deployment topology, sizing a cluster, or selecting between serving strategies. For users of [llm-d](https://llm-d.ai), for example, it's the step that narrows down which well-lit path is appropriate for a given use case before any hardware is provisioned. + --- ## The Experiment: Trusting but Verifying -Claiming a tool is accurate is easy. Measuring it is harder. 
We launched vLLM servers across 58 configurations on H100-80GB GPUs, captured the full startup logs for each, and parsed the actual memory measurements reported by vLLM at initialization. We then compared those measurements against llm-d-planner's predictions for every configuration. The sweep covered: +Claiming a tool is accurate is easy. Measuring it is harder. We launched vLLM servers across 60 configurations on H100-80GB GPUs, captured the full startup logs for each, and parsed the actual memory measurements reported by vLLM at initialization. We then compared those measurements against llm-d-planner's predictions for every configuration. The sweep covered: - **35 model architectures**: Llama, Qwen, Gemma, Granite, Mistral, DeepSeek, Phi, Mixtral, and multimodal models including LLaVA, Kimi-VL, and MiMo - **Tensor parallelism** (TP 1, 2, 4) and **pipeline parallelism** (PP 1, 2, 4) @@ -32,7 +34,7 @@ Claiming a tool is accurate is easy. Measuring it is harder. We launched vLLM se - **Dtype and quantization variants**: bfloat16, float16; compressed-tensors, GPTQ - **vLLM version sensitivity**: Qwen3-14B across v0.15.0 through v0.19.0 to track how memory behavior changes across releases -For each run, we compared predicted values against the four measured memory components independently. The [raw results and all run JSON files](https://github.com/llm-d-incubation/llm-d-planner/tree/main/accuracy/results) are committed to the repository, and the analysis is fully reproducible locally without cluster access. +For each run, we compared predicted values against the four measured memory components independently. The [raw logs and run JSON files](https://drive.google.com/drive/folders/1a0y2gdhcpKcFxm4RsqXUKWW40Gpd2Kx5) are published for reference, and the analysis is fully reproducible locally without cluster access. 
--- @@ -40,11 +42,11 @@ For each run, we compared predicted values against the four measured memory comp ### The headline: accurate where it counts most -**Weight memory: 0.89% mean absolute error** across 54 of the 58 runs. (The remaining 4 used parameters the planner doesn't yet model, float32 dtype and runtime fp8 quantization, and are discussed below.) This is the single largest memory component; for a model like Llama-3.1-8B at TP=1, weights consume about 15 GiB of the 79 GiB available. It's also the hardest to get right across a diverse model set. +**Weight memory: 0.84% mean absolute error** across 49 of the 60 runs. (The remaining 11 used parameters the planner doesn't yet model—float32 dtype, runtime fp8 quantization, and fp8 KV cache dtype—and are discussed below.) This is the single largest memory component; for a model like Llama-3.1-8B at TP=1, weights consume about 15 GiB of the 79 GiB available. It's also the hardest to get right across a diverse model set. Weight prediction is harder than it looks: dense, MoE, multi-head latent attention, and vision-language models all organize parameters differently, quantization changes the bytes-per-parameter, and TP sharding depends on how dimensions divide across ranks. The formula handles all of this by reading `config.json` for architecture parameters and safetensor headers for exact tensor shapes, giving precise counts without downloading the full model and making it generalizable to any model on HuggingFace beyond the 35 we explicitly tested. Across dense, MoE, multimodal, and quantized architectures, it held to under 1% error. -**KV cache memory: 0.34% mean error** across all runs. This is the component that matters most for capacity planning, as it determines your maximum concurrent token budget. For Llama-3.1-8B at TP=1 with 8K context, that's roughly 58 GiB of KV pool, and we're within half a GiB across every context length we tested. 
+**KV cache memory: 0.89% mean error** across all runs, and −6.96% at baseline (TP=PP=DP=1, 8K context, no quantization). This is the component that matters most for capacity planning, as it determines your maximum concurrent token budget. For Llama-3.1-8B at TP=1 with 8K context, that's roughly 58 GiB of KV pool, and we're within a few percent across every context length we tested. One insight worth pausing on: the KV pool size is *independent* of `max_model_len`. vLLM sizes the pool from whatever memory remains after weights and activation are allocated, then figures out how many tokens fit given the per-token KV size for that architecture. This means setting a longer context window doesn't shrink your KV pool; it just means each request can use a larger share of it, leaving less headroom for concurrent requests at the maximum context length. Tools that pre-allocate based on `max_model_len` will over-estimate memory for long-context configs and leave capacity on the table. @@ -52,7 +54,7 @@ These two components together typically account for 90%+ of total GPU memory con ### The honest part: smaller components, real errors -**Activation memory** showed a mean error of +195%. That sounds alarming, so let's ground it in absolute numbers. For Llama-3.1-8B at TP=1, our formula predicted 4.80 GiB; vLLM v0.19.0 actually used 1.89 GiB, an over-estimate of about 2.9 GiB. On a GPU with 79 GiB of VRAM where weights alone consume 15 GiB and the KV pool takes 58 GiB, a 2.9 GiB error in a smaller component is meaningful but bounded. +**Activation memory** showed a mean error of +212.88%. That sounds alarming, so let's ground it in absolute numbers. For Llama-3.1-8B at TP=1, our formula predicted 4.80 GiB; vLLM v0.19.0 actually used 1.89 GiB, an over-estimate of about 2.9 GiB. On a GPU with 79 GiB of VRAM where weights alone consume 15 GiB and the KV pool takes 58 GiB, a 2.9 GiB error in a smaller component is meaningful but bounded. 
The root cause is more interesting than the magnitude: **vLLM v0.17.0 quietly reduced activation memory by ~60%, and we didn't notice.** From f3265ef3200cc3b1c77f9ff84702438152eceed6 Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Wed, 29 Apr 2026 12:19:48 -0400 Subject: [PATCH 19/24] Fix numbers Signed-off-by: Jing Chen --- accuracy/accuracy_report.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/accuracy/accuracy_report.md b/accuracy/accuracy_report.md index 49e494d8..dc939f7d 100644 --- a/accuracy/accuracy_report.md +++ b/accuracy/accuracy_report.md @@ -10,25 +10,25 @@ ## Part 1: Accuracy Evaluation — vLLM v0.19.0 -Covers 54 runs across 35 models using only parameters the planner currently accepts as inputs. Excludes runs with `--dtype float32`, runtime `--quantization fp8`, and `--kv-cache-dtype fp8` (see Part 3). +Covers 49 runs across 28 models using only parameters the planner currently accepts as inputs. Excludes runs with `--dtype float32`, runtime `--quantization fp8`, and `--kv-cache-dtype fp8` (see Part 3). Additional models appear in the per-model and run matrix tables below but are not included in aggregate statistics because predictions have not yet been generated for them. 
### Summary | Metric | Mean error | Mean abs error | n | |--------|:----------:|:--------------:|:-:| -| KV cache memory (all runs) | +0.34% | +6.62% | 53 | -| KV cache memory (baseline: tp=pp=dp=1, len=8192, no quant) | -5.12% | — | 22 | -| Weight memory | -0.89% | +0.89% | 53 | -| Activation memory | +195.12% | +195.12% | 53 | -| Non-torch overhead | -44.08% | — | 53 | -| Max concurrency | +3.34% | +15.34% | 53 | +| KV cache memory (all runs) | +0.89% | +7.82% | 49 | +| KV cache memory (baseline: tp=pp=dp=1, len=8192, no quant) | -6.96% | — | 15 | +| Weight memory | -0.84% | +0.84% | 49 | +| Activation memory | +212.88% | +212.88% | 49 | +| Non-torch overhead | -44.81% | — | 49 | +| Max concurrency | +3.68% | +16.65% | 49 | **Key findings**: -- **Weight memory is accurate**: mean abs error +0.89%, computed directly from safetensors parameter counts. -- **KV cache memory is close**: +0.34% mean error across all runs; -5.12% at baseline. Errors are small and consistent. -- **Activation is the dominant error source**: mean +195.12% (over-estimate). The planner uses empirical constants calibrated against vLLM v0.16.0 or earlier; v0.17.0 introduced a ~60% reduction in reported activation overhead that persists through v0.19.0. See Part 2. -- **Max concurrency tracks KV accuracy**: +3.34% mean error; deviations come from the per-token KV formula, not the pool size prediction. +- **Weight memory is accurate**: mean abs error +0.84%, computed directly from safetensors parameter counts. +- **KV cache memory is close**: +0.89% mean error across all runs; -6.96% at baseline. Errors are small and consistent. +- **Activation is the dominant error source**: mean +212.88% (over-estimate). The planner uses empirical constants calibrated against vLLM v0.16.0 or earlier; v0.17.0 introduced a ~60% reduction in reported activation overhead that persists through v0.19.0. See Part 2. 
+- **Max concurrency tracks KV accuracy**: +3.68% mean error; deviations come from the per-token KV formula, not the pool size prediction. ### Per-Model Results — Baseline (TP=1, PP=1, DP=1, len=8192, no quantization) @@ -245,7 +245,7 @@ Runtime `--quantization fp8` compresses weights on-the-fly. vLLM logs the post-c ## Run Matrix — vLLM v0.19.0 / H100-80GB -**58 successful runs, 7 failed runs.** +**60 successful runs, 15 failed runs.** Quantization abbreviations: `ct` = compressed-tensors, `gptq` = gptq_marlin, `fp8` = fp8 inline, `mxfp4` = mx-format fp4, `—` = none. From 2fe116c3688cc289f1b105679ebb79f3be2167fb Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Thu, 30 Apr 2026 15:45:47 -0400 Subject: [PATCH 20/24] Address feedback Signed-off-by: Jing Chen --- accuracy/accuracy_report.md | 19 +++++++-- accuracy/blog-gpu-capacity.md | 79 ++++++++++++++--------------------- accuracy/scripts/sweep.yaml | 7 ++++ 3 files changed, 54 insertions(+), 51 deletions(-) diff --git a/accuracy/accuracy_report.md b/accuracy/accuracy_report.md index dc939f7d..8e5c3e9e 100644 --- a/accuracy/accuracy_report.md +++ b/accuracy/accuracy_report.md @@ -74,7 +74,7 @@ Covers 49 runs across 28 models using only parameters the planner currently acce ### Sensitivity: Pipeline Parallelism (PP) -Model: meta-llama/Llama-3.1-8B-Instruct, TP=1, len=8192 +**Llama-3.1-8B-Instruct, TP=1, len=8192** (32 layers) | PP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err | |:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:| @@ -82,8 +82,17 @@ Model: meta-llama/Llama-3.1-8B-Instruct, TP=1, len=8192 | 2 | 7.51 | -0.42% | 1.10 | +336.36% | 0.07 | +114.29% | -0.85% | | 4 | 4.26 | -12.22% | 1.05 | +357.14% | 0.07 | +114.29% | +1.59% | -- **Activation drops with PP**: PP=1 → 1.89 GiB, PP=2 → 1.10 GiB, PP=4 → 1.05 GiB. 
The formula always predicts 4.80 GiB regardless of PP. -- **Weight error grows with PP**: layer imbalance across stages causes the formula (which assumes uniform distribution) to deviate at high PP. +**Qwen3-8B, TP=1, len=8192** (36 layers; PP=3 gives exactly 12 layers per stage) + +| PP | Actual weight (GiB) | Weight err | Actual activ (GiB) | Activation err | Actual non-torch (GiB) | Non-torch err | KV cache err | +|:--:|:-------------------:|:----------:|:------------------:|:--------------:|:---------------------:|:-------------:|:------------:| +| 1 | 15.27 | -0.09% | 2.21 | +153.39% | 0.25 | -40.00% | -4.36% | +| 3 | 5.48 | -7.20% | 0.96 | +483.33% | 0.07 | +114.29% | +0.74% | + +- **PP=3 (odd) works**: vLLM v0.19.0 supports non-power-of-2 pipeline stages without issues. +- **Activation drops with PP**: follows the same pattern across both models; the formula always predicts the PP=1 constant regardless of PP. +- **Weight error grows with PP even with perfectly divisible layers**: Qwen3-8B has 36 layers ÷ 3 = exactly 12 per stage, yet weight error is -7.20%. The formula assumes all layers are equal size; embedding and LM-head layers are not evenly distributed across stages, adding per-stage overhead the formula misses. +- **KV cache error is small at PP≥2**: -0.85% to +1.59% across both models. ### Sensitivity: Context Length (max_model_len) @@ -139,6 +148,7 @@ Re-calibrating these constants against v0.19.0 measurements is the highest-value |:--:|:--:|:-------------------:|:-------------------:|:----------:| | 1 | 1 | 0.15 | 0.27 | -42.17% | | 1 | 2 | 0.15 | 0.07 | +114.29% | +| 1 | 3 | 0.15 | 0.07 | +114.29% | | 1 | 4 | 0.15 | 0.07 | +114.29% | | 2 | 1 | 0.60 | 2.08 | -71.15% | | 4 | 1 | 0.60 | 2.17 | -72.34% | @@ -245,7 +255,7 @@ Runtime `--quantization fp8` compresses weights on-the-fly. 
vLLM logs the post-c ## Run Matrix — vLLM v0.19.0 / H100-80GB -**60 successful runs, 15 failed runs.** +**61 successful runs, 15 failed runs.** Quantization abbreviations: `ct` = compressed-tensors, `gptq` = gptq_marlin, `fp8` = fp8 inline, `mxfp4` = mx-format fp4, `—` = none. @@ -305,6 +315,7 @@ Vision/multi-modal models in this sweep: `moonshotai/Kimi-VL-A3B-Instruct` (visi | Qwen/Qwen3-30B-A3B | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +198.5% | -44.4% | -28.8% | | Qwen/Qwen1.5-MoE-A2.7B | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.0% | +223.9% | -40.0% | -10.1% | | Qwen/Qwen3-8B | 1 | 1 | 1 | 8192 | bf16 | — | auto | -0.1% | +153.4% | -40.0% | -4.4% | +| Qwen/Qwen3-8B | 1 | 3 | 1 | 8192 | bf16 | — | auto | -7.2% | +483.3% | +114.3% | +0.7% | | RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic | 2 | 1 | 1 | 8192 | bf16 | ct | auto | -0.1% | +144.9% | -71.4% | +5.0% | | RedHatAI/Llama-3.3-70B-Instruct-fp8-dynamic | 4 | 1 | 1 | 8192 | bf16 | ct | auto | -0.2% | +143.7% | -72.9% | +5.9% | | redhatai/Llama-3.3-70B-Instruct-quantized.w8a8 | 2 | 1 | 1 | 8192 | bf16 | ct | auto | -0.1% | +144.9% | -71.6% | +5.0% | diff --git a/accuracy/blog-gpu-capacity.md b/accuracy/blog-gpu-capacity.md index 2fdabcaa..7bc958b1 100644 --- a/accuracy/blog-gpu-capacity.md +++ b/accuracy/blog-gpu-capacity.md @@ -1,64 +1,48 @@ -# 60 Experiments Later: What We Learned About LLM Memory Prediction +# 61 Experiments Later: What We Learned About LLM Memory Prediction -*GPU memory planning for LLM deployments is still mostly guesswork. Here's what we learned from measuring it empirically across 35 architectures.* +*GPU memory estimation for LLM deployments is still mostly guesswork, and it's the step you have to get right before anything else. Here's what we learned from measuring it empirically across 35 architectures.* --- -You're standing up a benchmark suite and need to know how many GPUs each model configuration requires before sizing the cluster. 
Or you're launching a serving application and want to plan capacity without over-provisioning by 3x. Or you're a researcher asking whether two H100s will be enough for a 70B model, or whether you need four. +You're planning a benchmark suite and need to know how many GPUs each model requires before sizing the cluster. You're launching a serving application and want to avoid over-provisioning by 3x. You're a researcher asking whether two H100s will be enough for a 70B model. -In all of these cases, the question is the same: **how much GPU memory will this actually need?** +The question is the same: **how much GPU memory will this actually need?** -Most teams answer it by copying what someone else deployed, or by spinning up the pod, watching it OOM, and doubling the resources. This works, but it gets harder as models grow larger and serving configurations more complex. Tensor parallelism, pipeline parallelism, quantization, and long-context windows all change the memory footprint in non-obvious ways. +Memory is the first gate. Either the model fits or it doesn't, and the only way to find out without a prediction tool is to spin up a vLLM server and see if it OOMs. Tensor parallelism, pipeline parallelism, quantization, and long-context windows all change the footprint in non-obvious ways, which makes trial-and-error expensive. This post is about memory estimation specifically: not throughput, latency, or any other performance metric. Those questions are downstream; this one comes first. -[llm-d-planner](https://github.com/llm-d-incubation/llm-d-planner) is an open-source library that guides LLM deployments from concept to production. One of its core submodules is a capacity planner built to answer this question before you deploy. To make sure it wasn't just replacing guesswork with a false sense of precision, we ran 60 experiments on H100 GPUs to validate its predictions against reality. 
Here's what we found, and why we're asking the community to help make it even better. +[llm-d-planner](https://github.com/llm-d-incubation/llm-d-planner) includes a pip-installable capacity planner that answers this question before you touch a cluster, using only model config files and safetensor headers. To verify it wasn't replacing guesswork with false precision, we ran 61 experiments on H100 GPUs. Here's what we found. --- -## What llm-d-planner Does +## How the Planner Works -llm-d-planner guides LLM deployments from concept to production: conversational requirements gathering, SLO-driven model and GPU recommendations, what-if analysis, one-click Kubernetes config generation, and monitoring. The capacity planner is a pip-installable subcomponent that focuses on one question: how much GPU memory will this deployment actually need. +Memory breaks into four components: weights, KV cache, activation memory, and non-torch overhead (CUDA runtime + NCCL buffers for multi-GPU). The planner reads `config.json` and safetensor headers for weights, reverse-engineers vLLM's KV cache allocation strategy, and uses empirically measured per-architecture constants for activation memory. No GPU required. -It breaks memory into four components: weights, KV cache, activation memory, and non-torch overhead (CUDA runtime and NCCL buffers for multi-GPU). Each scales differently with tensor parallelism, context length, and quantization, so knowing which component is driving your footprint tells you what to actually change. For each component, the planner anchors to a source of truth wherever one exists: `config.json` and safetensor file headers for weights, vLLM's allocation strategy for KV cache, and empirically measured constants for things that can't be derived analytically, like activation memory. The experiment in this post is how those constants are kept honest. 
- -Accurate memory prediction is also the prerequisite for any infrastructure decision: choosing a deployment topology, sizing a cluster, or selecting between serving strategies. For users of [llm-d](https://llm-d.ai), for example, it's the step that narrows down which well-lit path is appropriate for a given use case before any hardware is provisioned. +**Known gaps:** fp8 KV cache dtype and runtime fp8 quantization are not yet modeled. These can cut memory by 40-50%, so the planner will over-estimate for those configurations without issuing a warning. Float32 dtype overrides are also unsupported. If you're running fp8-quantized models today, treat the output as a baseline upper bound. --- -## The Experiment: Trusting but Verifying +## The Experiment -Claiming a tool is accurate is easy. Measuring it is harder. We launched vLLM servers across 60 configurations on H100-80GB GPUs, captured the full startup logs for each, and parsed the actual memory measurements reported by vLLM at initialization. We then compared those measurements against llm-d-planner's predictions for every configuration. The sweep covered: +We launched vLLM servers across 61 configurations on H100-80GB GPUs, captured startup logs, and compared measured memory against predictions per component. 
The sweep covered: -- **35 model architectures**: Llama, Qwen, Gemma, Granite, Mistral, DeepSeek, Phi, Mixtral, and multimodal models including LLaVA, Kimi-VL, and MiMo -- **Tensor parallelism** (TP 1, 2, 4) and **pipeline parallelism** (PP 1, 2, 4) +- **35 model architectures**: Llama, Qwen, Gemma, Granite, Mistral, DeepSeek, Phi, Mixtral, multimodal models (LLaVA, Kimi-VL, MiMo) +- **Tensor parallelism** (TP 1, 2, 4) and **pipeline parallelism** (PP 1, 2, 3, 4) - **Context lengths** from 2,048 to 32,768 tokens -- **Dtype and quantization variants**: bfloat16, float16; compressed-tensors, GPTQ -- **vLLM version sensitivity**: Qwen3-14B across v0.15.0 through v0.19.0 to track how memory behavior changes across releases +- **Dtype and quantization**: bfloat16, float16; compressed-tensors, GPTQ +- **vLLM version sensitivity**: Qwen3-14B across v0.15.0-v0.19.0 -For each run, we compared predicted values against the four measured memory components independently. The [raw logs and run JSON files](https://drive.google.com/drive/folders/1a0y2gdhcpKcFxm4RsqXUKWW40Gpd2Kx5) are published for reference, and the analysis is fully reproducible locally without cluster access. +[Raw logs and run JSON files](https://drive.google.com/drive/folders/1a0y2gdhcpKcFxm4RsqXUKWW40Gpd2Kx5) are published; the analysis is reproducible locally without cluster access. --- ## What We Found -### The headline: accurate where it counts most - -**Weight memory: 0.84% mean absolute error** across 49 of the 60 runs. (The remaining 11 used parameters the planner doesn't yet model—float32 dtype, runtime fp8 quantization, and fp8 KV cache dtype—and are discussed below.) This is the single largest memory component; for a model like Llama-3.1-8B at TP=1, weights consume about 15 GiB of the 79 GiB available. It's also the hardest to get right across a diverse model set. 
- -Weight prediction is harder than it looks: dense, MoE, multi-head latent attention, and vision-language models all organize parameters differently, quantization changes the bytes-per-parameter, and TP sharding depends on how dimensions divide across ranks. The formula handles all of this by reading `config.json` for architecture parameters and safetensor headers for exact tensor shapes, giving precise counts without downloading the full model and making it generalizable to any model on HuggingFace beyond the 35 we explicitly tested. Across dense, MoE, multimodal, and quantized architectures, it held to under 1% error. - -**KV cache memory: 0.89% mean error** across all runs, and −6.96% at baseline (TP=PP=DP=1, 8K context, no quantization). This is the component that matters most for capacity planning, as it determines your maximum concurrent token budget. For Llama-3.1-8B at TP=1 with 8K context, that's roughly 58 GiB of KV pool, and we're within a few percent across every context length we tested. - -One insight worth pausing on: the KV pool size is *independent* of `max_model_len`. vLLM sizes the pool from whatever memory remains after weights and activation are allocated, then figures out how many tokens fit given the per-token KV size for that architecture. This means setting a longer context window doesn't shrink your KV pool; it just means each request can use a larger share of it, leaving less headroom for concurrent requests at the maximum context length. Tools that pre-allocate based on `max_model_len` will over-estimate memory for long-context configs and leave capacity on the table. +**Weight memory: 0.84% mean error** across 49 of the 61 runs (the remaining 12 used fp8/float32 configurations not yet modeled). Weights are the largest single component; for Llama-3.1-8B at TP=1, that's ~15 GiB of 79 GiB available. 
The formula handles dense, MoE, multimodal, and quantized architectures by reading exact tensor shapes from safetensor headers, making it generalizable to any HuggingFace model beyond the 35 tested. -These two components together typically account for 90%+ of total GPU memory consumption. Getting them right is what makes the planner useful in practice. +**KV cache memory: 0.89% mean error** across all runs. This is the component that determines maximum concurrent token budget. One insight worth flagging: the KV pool size is *independent* of `max_model_len`. vLLM sizes the pool from whatever memory remains after weights and activations, then determines how many tokens fit. Setting a longer context window doesn't shrink your pool; it just means each request consumes a larger share, reducing concurrency at max context length. -### The honest part: smaller components, real errors - -**Activation memory** showed a mean error of +212.88%. That sounds alarming, so let's ground it in absolute numbers. For Llama-3.1-8B at TP=1, our formula predicted 4.80 GiB; vLLM v0.19.0 actually used 1.89 GiB, an over-estimate of about 2.9 GiB. On a GPU with 79 GiB of VRAM where weights alone consume 15 GiB and the KV pool takes 58 GiB, a 2.9 GiB error in a smaller component is meaningful but bounded. - -The root cause is more interesting than the magnitude: **vLLM v0.17.0 quietly reduced activation memory by ~60%, and we didn't notice.** - -Our version sensitivity study tells the story clearly: +**Activation memory: +212% mean error**, but in absolute terms that's a ~2.9 GiB over-estimate on a 79 GiB GPU. The more interesting finding is the root cause: **vLLM v0.17.0 reduced activation memory by ~60%, and we didn't notice.** | vLLM version | Activation (Qwen3-14B) | |:---:|:---:| @@ -68,29 +52,30 @@ Our version sensitivity study tells the story clearly: | v0.18.0 | 2.23 GiB | | v0.19.0 | ~2.21 GiB | -The planner's Qwen3 activation constant was 5.60 GiB, a near-exact match for v0.16.0. 
Our constants had been calibrated against an older vLLM release and were never updated as vLLM evolved. The 60% reduction at v0.17.0 freed memory that vLLM reallocated to the KV cache, actually *improving* serving capacity, but our planner didn't know about it. +Our constants were calibrated against v0.16.0 and never updated. The reduction freed memory that vLLM reallocated to the KV cache, actually improving serving capacity, but our planner was blind to it. Re-calibrating against v0.19.0 measurements is the highest-priority fix; contributions are welcome. The planner is not version-aware for older releases; if you're running an earlier vLLM in production, expect activation estimates to diverge. -This kind of silent drift is precisely why empirical validation matters. We didn't catch it until we ran the experiments, and the fix was straightforward once we knew where to look: re-calibrate every architecture constant against v0.19.0 measurements. That's now done, and the updated constants are in the library. +**Non-torch overhead** was under-estimated by 44% on average: small at TP=1 (~0.25 GiB actual vs 0.15 GiB predicted), more meaningful at TP>=2 (~2.1 GiB actual vs 0.60 GiB predicted). The sweep also caught a correctness bug: `find_possible_tp` wasn't verifying that TP values divide `vocab_size`, which could cause vLLM to reject configurations the planner suggested as valid. Fixed. -The planner currently tracks the behavior of the latest supported vLLM release (v0.19.0). It is not version-aware in the sense that it won't automatically adjust for older releases. When vLLM changes memory behavior in a future release, re-running the accuracy sweep and submitting a PR with updated constants is how the library stays current—which is exactly the kind of contribution the community campaign is designed to support. +Additional findings from the full data: -**Non-torch overhead** (CUDA runtime + NCCL buffers) was under-estimated by 44% on average. 
At TP=1, this is a small absolute amount (~0.25 GiB actual vs 0.15 GiB predicted). At TP>=2, NCCL all-reduce buffers push actual overhead to ~2.1 GiB per GPU versus our constant of 0.60 GiB, a more meaningful gap. Updated multi-GPU constants are also in. +- **Max concurrency tracked KV accuracy (3.68% mean error).** The planner predicts how many concurrent requests fit at a given context length (the number most teams actually want), and it inherits the KV cache error directly, since concurrency is just KV tokens divided by `max_model_len`. +- **Activation error varies significantly by architecture.** Granite was +633%, Mistral3 was only +23%. The v0.17.0 reduction wasn't uniform; some architectures were hit harder, and the per-architecture constants need to be re-measured individually. +- **Context length has zero effect on the KV pool size.** KV GiB was identical to two decimal places from 2K to 32K tokens across both Llama and Qwen models, confirming that `max_model_len` controls request size, not pool allocation. +- **Pipeline parallelism introduces weight error even with perfectly divisible layers.** Qwen3-8B has 36 layers; PP=3 gives exactly 12 per stage, yet weight error was -7.20%. The formula assumes all layers are equal size, but embedding and LM-head layers aren't evenly distributed across pipeline stages. +- **Valid TP must divide both `num_attention_heads` and `vocab_size`.** vLLM shards the embedding and LM-head across TP ranks, so a TP value that doesn't divide `vocab_size` will be rejected at startup even if it evenly divides the attention heads. Qwen3-14B has 40 heads (TP=5 looks valid) but `vocab_size=151936` is not divisible by 5, so vLLM rejects it. The planner was only checking attention heads; the fix is to return divisors of `gcd(num_attention_heads, vocab_size)`. 
+- **`kv_cache_dtype fp8` doubles token capacity but leaves the KV pool size in GiB unchanged.** fp8 halves per-token storage, so twice as many tokens fit in the same memory pool, but the pool size in GiB is unaffected. The planner doesn't yet accept `kv_cache_dtype` as an input, so it under-estimates token count by ~2x for fp8 KV configurations while getting the GiB right. +- **Runtime quantization and dtype overrides cause large weight errors.** `--quantization fp8` compresses weights on-the-fly to ~half the BF16 size, but the planner reads the HuggingFace config (which has no quantization entry) and predicts full BF16 weights, resulting in a +76% weight over-estimate. Conversely, `--dtype float32` doubles weight memory; the planner reads the model's native BF16 dtype and under-estimates by -50%. Both are unsupported inputs today. -There are a few configurations the experiment didn't cover that the planner doesn't yet model: fp8 KV cache dtype (halves per-token storage, roughly doubling token capacity), float32 dtype overrides (doubles weight memory), runtime fp8 quantization, and data parallelism. For entirely unknown precision types, the planner raises an error. For these specific gaps, the planner will produce an estimate using the base model configuration without accounting for the override—meaning results may be off for that component without an explicit warning. These are real gaps for anyone running quantized production models today, and they're actively being worked on — contributions are welcome if you need one of these sooner. The sweep also turned up a subtle correctness bug in `find_possible_tp`: it wasn't verifying that TP values divide `vocab_size`, which can cause vLLM to reject a configuration the planner suggests as valid. That's fixed. 
+For the complete per-model and per-configuration breakdown (TP, PP, quantization, and context length sensitivity tables), see the [full accuracy report](https://github.com/llm-d-incubation/llm-d-planner/blob/main/accuracy/accuracy_report.md). --- ## Join the Community -We covered 35 architectures. The LLM landscape releases more every week, and vLLM will keep evolving. Accuracy at a point in time isn't enough; what matters is having a community that keeps the constants current as things change. - -**If your model isn't covered, or a new architecture ships with memory optimizations** (a new attention variant, a custom KV cache layout, or a novel quantization scheme), llm-d-planner should be where those updated constants land first. The sweep runner in `accuracy/` is fully documented and self-contained; run it against your own cluster, submit the results as a PR, and everyone who installs the library gets the improvement. - -**Get started:** +We covered 35 architectures. The LLM landscape ships new ones every week, and vLLM keeps evolving. The sweep runner in `accuracy/` is self-contained; run it against your cluster, submit a PR, and everyone gets the improvement. - [GitHub: llm-d-incubation/llm-d-planner](https://github.com/llm-d-incubation/llm-d-planner) -- [Accuracy campaign results and methodology](https://github.com/llm-d-incubation/llm-d-planner/tree/main/accuracy) +- [Full accuracy report with per-model tables](https://github.com/llm-d-incubation/llm-d-planner/blob/main/accuracy/accuracy_report.md) - [Run the sweep on your own cluster](https://github.com/llm-d-incubation/llm-d-planner/blob/main/accuracy/README.md) -- Open an issue or PR; contributions welcome No one should have to guess how many GPUs they need. 
diff --git a/accuracy/scripts/sweep.yaml b/accuracy/scripts/sweep.yaml index 2d25af75..54172ddf 100644 --- a/accuracy/scripts/sweep.yaml +++ b/accuracy/scripts/sweep.yaml @@ -147,6 +147,13 @@ runs: pp: [2, 4] _sweep_dim: pp + # Qwen3-8B: 36 layers; pp=3 tests odd pipeline stage count (36 / 3 = 12 layers/stage, exactly even). + # pp=1 baseline is covered by the Qwen3-8B core run above. + - model: Qwen/Qwen3-8B + tp: 1 + pp: 3 + _sweep_dim: pp + # ── Sensitivity: context length (max_model_len) ───────────────────────── # len=8192 baseline is covered by the tp sweep entries above. # KV pool (GiB) is independent of max_model_len; only token count changes. From 427ef31e13a16a16887cbdca2f21720701c1b9fb Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Thu, 30 Apr 2026 15:49:53 -0400 Subject: [PATCH 21/24] Simplification Signed-off-by: Jing Chen --- accuracy/blog-gpu-capacity.md | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/accuracy/blog-gpu-capacity.md b/accuracy/blog-gpu-capacity.md index 7bc958b1..82f08405 100644 --- a/accuracy/blog-gpu-capacity.md +++ b/accuracy/blog-gpu-capacity.md @@ -38,11 +38,11 @@ We launched vLLM servers across 61 configurations on H100-80GB GPUs, captured st ## What We Found -**Weight memory: 0.84% mean error** across 49 of the 61 runs (the remaining 12 used fp8/float32 configurations not yet modeled). Weights are the largest single component; for Llama-3.1-8B at TP=1, that's ~15 GiB of 79 GiB available. The formula handles dense, MoE, multimodal, and quantized architectures by reading exact tensor shapes from safetensor headers, making it generalizable to any HuggingFace model beyond the 35 tested. +**Weight memory: 0.84% mean error** across 49 of the 61 runs (the remaining 12 used fp8/float32 configurations not yet modeled). For Llama-3.1-8B at TP=1, weights consume ~15 GiB of 79 GiB available. 
The formula reads exact tensor shapes from safetensor headers, making it generalizable to any HuggingFace model beyond the 35 tested. -**KV cache memory: 0.89% mean error** across all runs. This is the component that determines maximum concurrent token budget. One insight worth flagging: the KV pool size is *independent* of `max_model_len`. vLLM sizes the pool from whatever memory remains after weights and activations, then determines how many tokens fit. Setting a longer context window doesn't shrink your pool; it just means each request consumes a larger share, reducing concurrency at max context length. +**KV cache memory: 0.89% mean error** across all runs. This determines your maximum concurrent token budget. The KV pool size is *independent* of `max_model_len`; vLLM sizes the pool from leftover memory after weights and activations, so a longer context window reduces concurrency rather than expanding the pool. -**Activation memory: +212% mean error**, but in absolute terms that's a ~2.9 GiB over-estimate on a 79 GiB GPU. The more interesting finding is the root cause: **vLLM v0.17.0 reduced activation memory by ~60%, and we didn't notice.** +**Activation memory: +212% mean error**, but in absolute terms that's a ~2.9 GiB over-estimate on a 79 GiB GPU. The root cause: **vLLM v0.17.0 reduced activation memory by ~60%, and we didn't notice.** | vLLM version | Activation (Qwen3-14B) | |:---:|:---:| @@ -52,21 +52,17 @@ We launched vLLM servers across 61 configurations on H100-80GB GPUs, captured st | v0.18.0 | 2.23 GiB | | v0.19.0 | ~2.21 GiB | -Our constants were calibrated against v0.16.0 and never updated. The reduction freed memory that vLLM reallocated to the KV cache, actually improving serving capacity, but our planner was blind to it. Re-calibrating against v0.19.0 measurements is the highest-priority fix; contributions are welcome. 
The planner is not version-aware for older releases; if you're running an earlier vLLM in production, expect activation estimates to diverge. +Our constants were calibrated against v0.16.0 and never updated. Re-calibrating against v0.19.0 is the highest-priority fix; contributions are welcome. The planner is not version-aware for older releases; if you're running an earlier vLLM in production, expect activation estimates to diverge. -**Non-torch overhead** was under-estimated by 44% on average: small at TP=1 (~0.25 GiB actual vs 0.15 GiB predicted), more meaningful at TP>=2 (~2.1 GiB actual vs 0.60 GiB predicted). The sweep also caught a correctness bug: `find_possible_tp` wasn't verifying that TP values divide `vocab_size`, which could cause vLLM to reject configurations the planner suggested as valid. Fixed. +**Non-torch overhead** was under-estimated by 44% on average: small at TP=1 (~0.25 GiB actual vs 0.15 GiB predicted), more meaningful at TP>=2 (~2.1 GiB actual vs 0.60 GiB predicted). -Additional findings from the full data: +Additional findings: -- **Max concurrency tracked KV accuracy (3.68% mean error).** The planner predicts how many concurrent requests fit at a given context length (the number most teams actually want), and it inherits the KV cache error directly, since concurrency is just KV tokens divided by `max_model_len`. -- **Activation error varies significantly by architecture.** Granite was +633%, Mistral3 was only +23%. The v0.17.0 reduction wasn't uniform; some architectures were hit harder, and the per-architecture constants need to be re-measured individually. -- **Context length has zero effect on the KV pool size.** KV GiB was identical to two decimal places from 2K to 32K tokens across both Llama and Qwen models, confirming that `max_model_len` controls request size, not pool allocation. 
-- **Pipeline parallelism introduces weight error even with perfectly divisible layers.** Qwen3-8B has 36 layers; PP=3 gives exactly 12 per stage, yet weight error was -7.20%. The formula assumes all layers are equal size, but embedding and LM-head layers aren't evenly distributed across pipeline stages. -- **Valid TP must divide both `num_attention_heads` and `vocab_size`.** vLLM shards the embedding and LM-head across TP ranks, so a TP value that doesn't divide `vocab_size` will be rejected at startup even if it evenly divides the attention heads. Qwen3-14B has 40 heads (TP=5 looks valid) but `vocab_size=151936` is not divisible by 5, so vLLM rejects it. The planner was only checking attention heads; the fix is to return divisors of `gcd(num_attention_heads, vocab_size)`. -- **`kv_cache_dtype fp8` doubles token capacity but leaves the KV pool size in GiB unchanged.** fp8 halves per-token storage, so twice as many tokens fit in the same memory pool, but the pool size in GiB is unaffected. The planner doesn't yet accept `kv_cache_dtype` as an input, so it under-estimates token count by ~2x for fp8 KV configurations while getting the GiB right. -- **Runtime quantization and dtype overrides cause large weight errors.** `--quantization fp8` compresses weights on-the-fly to ~half the BF16 size, but the planner reads the HuggingFace config (which has no quantization entry) and predicts full BF16 weights, resulting in a +76% weight over-estimate. Conversely, `--dtype float32` doubles weight memory; the planner reads the model's native BF16 dtype and under-estimates by -50%. Both are unsupported inputs today. +- **Activation error varies by architecture.** Granite was +633%, Mistral3 was only +23%. The v0.17.0 reduction wasn't uniform; per-architecture constants need to be re-measured individually. 
+- **Valid TP must divide both `num_attention_heads` and `vocab_size`.** Qwen3-14B has 40 attention heads, making TP=5 look valid, but `vocab_size=151936` is not divisible by 5 and vLLM rejects it at startup. The planner was only checking attention heads; the fix is to return divisors of `gcd(num_attention_heads, vocab_size)`. +- **`kv_cache_dtype fp8` doubles token capacity but leaves the KV pool in GiB unchanged.** fp8 halves per-token storage, so twice as many tokens fit in the same pool. The planner doesn't yet model this, so it under-estimates token count by ~2x for fp8 KV configurations while getting the GiB right. -For the complete per-model and per-configuration breakdown (TP, PP, quantization, and context length sensitivity tables), see the [full accuracy report](https://github.com/llm-d-incubation/llm-d-planner/blob/main/accuracy/accuracy_report.md). +For the complete per-model breakdown, see the [full accuracy report](https://github.com/llm-d-incubation/llm-d-planner/blob/main/accuracy/accuracy_report.md). --- From 91397a2a2f66069ad701321dcd859264123b7602 Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Thu, 30 Apr 2026 19:38:52 -0400 Subject: [PATCH 22/24] Several revisions Signed-off-by: Jing Chen --- accuracy/blog-gpu-capacity.md | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/accuracy/blog-gpu-capacity.md b/accuracy/blog-gpu-capacity.md index 82f08405..458c0e4a 100644 --- a/accuracy/blog-gpu-capacity.md +++ b/accuracy/blog-gpu-capacity.md @@ -1,6 +1,6 @@ -# 61 Experiments Later: What We Learned About LLM Memory Prediction +# 61 Experiments Later: What We Learned About LLM Memory Estimation -*GPU memory estimation for LLM deployments is still mostly guesswork, and it's the step you have to get right before anything else. Here's what we learned from measuring it empirically across 35 architectures.* +*GPU memory estimation for LLM deployments is still mostly guesswork. llm-d-planner is designed to solve it. 
Here's what validating its estimates against 61 experiments across 35 architectures taught us, and why empirical grounding is what separates a useful tool from a confident guess.* --- @@ -8,7 +8,7 @@ You're planning a benchmark suite and need to know how many GPUs each model requ The question is the same: **how much GPU memory will this actually need?** -Memory is the first gate. Either the model fits or it doesn't, and the only way to find out without a prediction tool is to spin up a vLLM server and see if it OOMs. Tensor parallelism, pipeline parallelism, quantization, and long-context windows all change the footprint in non-obvious ways, which makes trial-and-error expensive. This post is about memory estimation specifically: not throughput, latency, or any other performance metric. Those questions are downstream; this one comes first. +Memory is the first gate. Either the model fits or it doesn't, and the only way to find out without a memory planning tool is to spin up a vLLM server and see if it OOMs. Tensor parallelism, pipeline parallelism, quantization, and long-context windows all change the footprint in non-obvious ways, which makes trial-and-error expensive. This post is about memory estimation specifically: not throughput, latency, or any other performance metric. Before you can answer any question about performance, you need to answer this one: does the model fit? [llm-d-planner](https://github.com/llm-d-incubation/llm-d-planner) includes a pip-installable capacity planner that answers this question before you touch a cluster, using only model config files and safetensor headers. To verify it wasn't replacing guesswork with false precision, we ran 61 experiments on H100 GPUs. Here's what we found. @@ -16,15 +16,15 @@ Memory is the first gate. 
Either the model fits or it doesn't, and the only way ## How the Planner Works -Memory breaks into four components: weights, KV cache, activation memory, and non-torch overhead (CUDA runtime + NCCL buffers for multi-GPU). The planner reads `config.json` and safetensor headers for weights, reverse-engineers vLLM's KV cache allocation strategy, and uses empirically measured per-architecture constants for activation memory. No GPU required. +Memory breaks into four components: weights, KV cache, activation memory, and non-torch overhead (CUDA runtime + NCCL buffers for multi-GPU). The theoretical approach is standard: weight memory from parameter counts and dtype, KV cache from attention head dimensions and context length. Any careful engineer would use the same formulas. What makes this different is that we measured actual vLLM behavior across 61 configurations to validate the constants and catch where theory diverges from reality. The result: **weight and KV cache errors under 1%** across standard bfloat16/float16 configurations on H100-80GB. These two components account for 90%+ of total GPU memory. No GPU is required at planning time; all constants are derived from prior empirical measurements on real hardware. -**Known gaps:** fp8 KV cache dtype and runtime fp8 quantization are not yet modeled. These can cut memory by 40-50%, so the planner will over-estimate for those configurations without issuing a warning. Float32 dtype overrides are also unsupported. If you're running fp8-quantized models today, treat the output as a baseline upper bound. +> **Current limitations:** All constants are calibrated on H100-80GB; other GPU types may differ, especially in non-torch overhead. fp8 KV cache dtype and runtime fp8 quantization are not yet modeled and can cut memory by 40-50%, so the planner will over-estimate for those configurations without warning. Float32 dtype overrides are also unsupported. 
Contributions covering additional GPU types or missing quantization modes are welcome. --- ## The Experiment -We launched vLLM servers across 61 configurations on H100-80GB GPUs, captured startup logs, and compared measured memory against predictions per component. The sweep covered: +We launched vLLM servers across 61 configurations on H100-80GB GPUs, captured startup logs, and compared measured memory against estimates per component. The sweep covered: - **35 model architectures**: Llama, Qwen, Gemma, Granite, Mistral, DeepSeek, Phi, Mixtral, multimodal models (LLaVA, Kimi-VL, MiMo) - **Tensor parallelism** (TP 1, 2, 4) and **pipeline parallelism** (PP 1, 2, 3, 4) @@ -52,13 +52,13 @@ We launched vLLM servers across 61 configurations on H100-80GB GPUs, captured st | v0.18.0 | 2.23 GiB | | v0.19.0 | ~2.21 GiB | -Our constants were calibrated against v0.16.0 and never updated. Re-calibrating against v0.19.0 is the highest-priority fix; contributions are welcome. The planner is not version-aware for older releases; if you're running an earlier vLLM in production, expect activation estimates to diverge. +Our constants reflected v0.16.0 behavior and weren't updated when vLLM changed. The v0.17.0 optimization is exactly the kind of significant change that warrants revisiting the model, not as a routine calibration exercise, but because the underlying behavior shifted materially. The longer-term goal is to derive activation memory from first principles so estimates don't depend on vLLM version at all. If you're running an earlier vLLM release, expect activation estimates to diverge from current behavior. -**Non-torch overhead** was under-estimated by 44% on average: small at TP=1 (~0.25 GiB actual vs 0.15 GiB predicted), more meaningful at TP>=2 (~2.1 GiB actual vs 0.60 GiB predicted). +**Non-torch overhead** was under-estimated by 44% on average: small at TP=1 (~0.25 GiB actual vs 0.15 GiB predicted), more meaningful at TP>=2 (~2.1 GiB actual vs 0.60 GiB estimated). 
Additional findings: -- **Activation error varies by architecture.** Granite was +633%, Mistral3 was only +23%. The v0.17.0 reduction wasn't uniform; per-architecture constants need to be re-measured individually. +- **Activation error varies by architecture.** Granite was +633%, Mistral3 was only +23%. The v0.17.0 reduction wasn't uniform, which points to architecture-specific behavior that a stronger theoretical model should account for directly rather than through per-family constants. - **Valid TP must divide both `num_attention_heads` and `vocab_size`.** Qwen3-14B has 40 attention heads, making TP=5 look valid, but `vocab_size=151936` is not divisible by 5 and vLLM rejects it at startup. The planner was only checking attention heads; the fix is to return divisors of `gcd(num_attention_heads, vocab_size)`. - **`kv_cache_dtype fp8` doubles token capacity but leaves the KV pool in GiB unchanged.** fp8 halves per-token storage, so twice as many tokens fit in the same pool. The planner doesn't yet model this, so it under-estimates token count by ~2x for fp8 KV configurations while getting the GiB right. @@ -68,10 +68,10 @@ For the complete per-model breakdown, see the [full accuracy report](https://git ## Join the Community -We covered 35 architectures. The LLM landscape ships new ones every week, and vLLM keeps evolving. The sweep runner in `accuracy/` is self-contained; run it against your cluster, submit a PR, and everyone gets the improvement. +We covered 35 architectures. The sweep wasn't about generating constants to maintain; it was about verifying that the theoretical formulas hold and finding where they don't. If you hit a model family or configuration not covered here, or if a future vLLM release introduces a significant memory optimization, the sweep runner in `accuracy/` is self-contained and can be run to validate the current estimates against new behavior. 
- [GitHub: llm-d-incubation/llm-d-planner](https://github.com/llm-d-incubation/llm-d-planner) - [Full accuracy report with per-model tables](https://github.com/llm-d-incubation/llm-d-planner/blob/main/accuracy/accuracy_report.md) -- [Run the sweep on your own cluster](https://github.com/llm-d-incubation/llm-d-planner/blob/main/accuracy/README.md) -No one should have to guess how many GPUs they need. +The broader goal is infrastructure planning at day 0. Before you provision hardware, you should know whether your model fits, in what configuration, and whether your expected workload (concurrency, context length, number of replicas) is supportable on the hardware you're considering. Memory estimation is what makes that possible. The alternative is discovering at deployment time that your hardware can't support the workload you designed for, which is an expensive place to find out. Get these answers before you deploy. + From 892cd908e31d3cfc76337caf17d06f2cef85886a Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Thu, 30 Apr 2026 19:53:31 -0400 Subject: [PATCH 23/24] Tone: story rather than technical report Signed-off-by: Jing Chen --- accuracy/blog-gpu-capacity.md | 52 +++++++++++++++++------------------ 1 file changed, 25 insertions(+), 27 deletions(-) diff --git a/accuracy/blog-gpu-capacity.md b/accuracy/blog-gpu-capacity.md index 458c0e4a..7a05a77d 100644 --- a/accuracy/blog-gpu-capacity.md +++ b/accuracy/blog-gpu-capacity.md @@ -16,15 +16,15 @@ Memory is the first gate. Either the model fits or it doesn't, and the only way ## How the Planner Works -Memory breaks into four components: weights, KV cache, activation memory, and non-torch overhead (CUDA runtime + NCCL buffers for multi-GPU). The theoretical approach is standard: weight memory from parameter counts and dtype, KV cache from attention head dimensions and context length. Any careful engineer would use the same formulas. 
What makes this different is that we measured actual vLLM behavior across 61 configurations to validate the constants and catch where theory diverges from reality. The result: **weight and KV cache errors under 1%** across standard bfloat16/float16 configurations on H100-80GB. These two components account for 90%+ of total GPU memory. No GPU is required at planning time; all constants are derived from prior empirical measurements on real hardware. +Memory breaks into four components: weights, KV cache, activation memory, and non-torch overhead. The theoretical approach is standard: weight memory from parameter counts and dtype, KV cache from attention head dimensions and context length. What makes this different is that we measured actual vLLM behavior across 61 configurations to validate those formulas and catch where theory diverges from reality. No GPU is required at planning time; all constants are grounded in prior empirical measurements on real hardware. -> **Current limitations:** All constants are calibrated on H100-80GB; other GPU types may differ, especially in non-torch overhead. fp8 KV cache dtype and runtime fp8 quantization are not yet modeled and can cut memory by 40-50%, so the planner will over-estimate for those configurations without warning. Float32 dtype overrides are also unsupported. Contributions covering additional GPU types or missing quantization modes are welcome. +**Current limitations:** Constants are calibrated on H100-80GB; other GPU types may differ. fp8 quantization modes and float32 dtype overrides are not yet modeled — treat fp8 estimates as an upper bound, and expect float32 overrides to be under-estimated, since they roughly double weight memory. --- ## The Experiment -We launched vLLM servers across 61 configurations on H100-80GB GPUs, captured startup logs, and compared measured memory against estimates per component. The sweep covered: +We launched vLLM servers across 61 configurations on H100-80GB GPUs, captured startup logs, and compared measured memory against the planner's estimates.
vLLM reports per-component memory usage at initialization, which is what made per-component error measurement possible. The sweep covered: - **35 model architectures**: Llama, Qwen, Gemma, Granite, Mistral, DeepSeek, Phi, Mixtral, multimodal models (LLaVA, Kimi-VL, MiMo) - **Tensor parallelism** (TP 1, 2, 4) and **pipeline parallelism** (PP 1, 2, 3, 4) @@ -38,40 +38,38 @@ We launched vLLM servers across 61 configurations on H100-80GB GPUs, captured st ## What We Found -**Weight memory: 0.84% mean error** across 49 of the 61 runs (the remaining 12 used fp8/float32 configurations not yet modeled). For Llama-3.1-8B at TP=1, weights consume ~15 GiB of 79 GiB available. The formula reads exact tensor shapes from safetensor headers, making it generalizable to any HuggingFace model beyond the 35 tested. +The two components that dominate GPU memory — weights and KV cache, together over 90% of total usage — came in under 1% mean error across standard configurations. Here's how that breaks down by model family: -**KV cache memory: 0.89% mean error** across all runs. This determines your maximum concurrent token budget. The KV pool size is *independent* of `max_model_len`; vLLM sizes the pool from leftover memory after weights and activations, so a longer context window reduces concurrency rather than expanding the pool. +| Family | Type | Weight error | KV error | +|---|---|:---:|:---:| +| Llama | Dense | <1% | 3–5% | +| Qwen | Dense | <1% | ~4% | +| Gemma 2 | Dense | <1% | 1–5% | +| Gemma 3 | Dense | <7%* | <2% | +| Granite | Dense | <1% | 5–6% | +| Phi | Dense | <1% | 5–7% | +| Mistral | Dense | <1% | <2% | +| Mixtral | MoE | <1% | ~2% | +| Qwen MoE | MoE | <1% | 10–29% | +| DeepSeek MLA | MoE | <1% | ~12% | +| LLaVA, Kimi-VL, MiMo-VL | Multimodal | <1% | 1–10% | -**Activation memory: +212% mean error**, but in absolute terms that's a ~2.9 GiB over-estimate on a 79 GiB GPU. 
The root cause: **vLLM v0.17.0 reduced activation memory by ~60%, and we didn't notice.** +\* One Gemma 3 variant (4B) had 6.65% weight error; all others under 1%. -| vLLM version | Activation (Qwen3-14B) | -|:---:|:---:| -| v0.15.0 | 5.64 GiB | -| v0.16.0 | 5.64 GiB | -| **v0.17.0** | **2.23 GiB** | -| v0.18.0 | 2.23 GiB | -| v0.19.0 | ~2.21 GiB | +Weight estimation holds consistently across all architecture types. KV cache error is higher for sparse MoE models — Qwen3-30B-A3B, where only ~10% of parameters are active per token, drives the 29% upper bound. All of the weight and KV cache errors in the table above are under-estimates: the planner predicts slightly less memory for these components than vLLM actually uses, which is the safe direction for capacity planning. -Our constants reflected v0.16.0 behavior and weren't updated when vLLM changed. The v0.17.0 optimization is exactly the kind of significant change that warrants revisiting the model, not as a routine calibration exercise, but because the underlying behavior shifted materially. The longer-term goal is to derive activation memory from first principles so estimates don't depend on vLLM version at all. If you're running an earlier vLLM release, expect activation estimates to diverge from current behavior. +**Where we got it wrong.** Activation memory showed +212% mean error, and the reason is a story worth telling. Between v0.16.0 and v0.17.0, vLLM silently reduced activation memory overhead by ~60% (from 5.64 GiB down to 2.23 GiB for Qwen3-14B), and we didn't notice until we ran these experiments. Our constants reflected the older behavior. In absolute terms the impact was ~2.9 GiB on a 79 GiB GPU: bounded, but real, and the kind of drift that's invisible without empirical validation. The right fix isn't to chase vLLM releases with updated constants; it's to derive activation memory from first principles so the estimate doesn't depend on framework internals at all.
-**Non-torch overhead** was under-estimated by 44% on average: small at TP=1 (~0.25 GiB actual vs 0.15 GiB predicted), more meaningful at TP>=2 (~2.1 GiB actual vs 0.60 GiB estimated). - -Additional findings: - -- **Activation error varies by architecture.** Granite was +633%, Mistral3 was only +23%. The v0.17.0 reduction wasn't uniform, which points to architecture-specific behavior that a stronger theoretical model should account for directly rather than through per-family constants. -- **Valid TP must divide both `num_attention_heads` and `vocab_size`.** Qwen3-14B has 40 attention heads, making TP=5 look valid, but `vocab_size=151936` is not divisible by 5 and vLLM rejects it at startup. The planner was only checking attention heads; the fix is to return divisors of `gcd(num_attention_heads, vocab_size)`. -- **`kv_cache_dtype fp8` doubles token capacity but leaves the KV pool in GiB unchanged.** fp8 halves per-token storage, so twice as many tokens fit in the same pool. The planner doesn't yet model this, so it under-estimates token count by ~2x for fp8 KV configurations while getting the GiB right. - -For the complete per-model breakdown, see the [full accuracy report](https://github.com/llm-d-incubation/llm-d-planner/blob/main/accuracy/accuracy_report.md). +For the complete per-model and per-configuration breakdown, see the [full accuracy report](https://github.com/llm-d-incubation/llm-d-planner/blob/main/accuracy/accuracy_report.md). --- -## Join the Community +## Plan Before You Provision -We covered 35 architectures. The sweep wasn't about generating constants to maintain; it was about verifying that the theoretical formulas hold and finding where they don't. If you hit a model family or configuration not covered here, or if a future vLLM release introduces a significant memory optimization, the sweep runner in `accuracy/` is self-contained and can be run to validate the current estimates against new behavior. +The goal isn't a perfect number. 
It's a good enough answer at day 0 — before you've committed to hardware, before you've designed your deployment topology, before you've found out that the workload you planned for doesn't fit on the cluster you ordered. Memory estimation is what gives you that answer early: whether the model fits, in what configuration, and what concurrency your hardware can actually support under your expected workload. - [GitHub: llm-d-incubation/llm-d-planner](https://github.com/llm-d-incubation/llm-d-planner) - [Full accuracy report with per-model tables](https://github.com/llm-d-incubation/llm-d-planner/blob/main/accuracy/accuracy_report.md) +- [Run the sweep on your own cluster](https://github.com/llm-d-incubation/llm-d-planner/blob/main/accuracy/README.md) -The broader goal is infrastructure planning at day 0. Before you provision hardware, you should know whether your model fits, in what configuration, and whether your expected workload (concurrency, context length, number of replicas) is supportable on the hardware you're considering. Memory estimation is what makes that possible. The alternative is discovering at deployment time that your hardware can't support the workload you designed for, which is an expensive place to find out. Get these answers before you deploy. - +No one should have to spin up a cluster to find out their workload doesn't fit. 
From 151baf3dd6d8256305c9b3f9562e0606c94cb145 Mon Sep 17 00:00:00 2001 From: Jing Chen Date: Thu, 30 Apr 2026 20:03:44 -0400 Subject: [PATCH 24/24] Better table Signed-off-by: Jing Chen --- accuracy/blog-gpu-capacity.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/accuracy/blog-gpu-capacity.md b/accuracy/blog-gpu-capacity.md index 7a05a77d..8fcd52dc 100644 --- a/accuracy/blog-gpu-capacity.md +++ b/accuracy/blog-gpu-capacity.md @@ -38,7 +38,7 @@ We launched vLLM servers across 61 configurations on H100-80GB GPUs, captured st ## What We Found -The two components that dominate GPU memory — weights and KV cache, together over 90% of total usage — came in under 1% mean error across standard configurations. Here's how that breaks down by model family: +The two components that dominate GPU memory, weights and KV cache, account for over 90% of total usage. Weight estimation came in under 1% mean error across all architectures. KV cache error was low for dense models (typically under 5%) but higher for sparse MoE architectures. Here's how that breaks down by model family: | Family | Type | Weight error | KV error | |---|---|:---:|:---:| @@ -60,16 +60,16 @@ Weight estimation holds consistently across all architecture types. KV cache err **Where we got it wrong.** Activation memory showed +212% mean error, and the reason is a story worth telling. Between v0.16.0 and v0.17.0, vLLM silently reduced activation memory overhead by ~60% (from 5.64 GiB down to 2.23 GiB for Qwen3-14B), and we didn't notice until we ran these experiments. Our constants reflected the older behavior. In absolute terms the impact was ~2.9 GiB on a 79 GiB GPU: bounded, but real, and the kind of drift that's invisible without empirical validation. The right fix isn't to chase vLLM releases with updated constants; it's to derive activation memory from first principles so the estimate doesn't depend on framework internals at all. 
-For the complete per-model and per-configuration breakdown, see the [full accuracy report](https://github.com/llm-d-incubation/llm-d-planner/blob/main/accuracy/accuracy_report.md). +For the complete per-model and per-configuration breakdown, see the [full accuracy report](https://github.com/llm-d-incubation/llm-d-planner/blob/main/accuracy/accuracy_report.md). Beyond the numbers, running these experiments gave us a clearer picture of how LLMs actually behave at runtime, and those findings will inform the next round of improvements to the planner's accuracy. --- ## Plan Before You Provision -The goal isn't a perfect number. It's a good enough answer at day 0 — before you've committed to hardware, before you've designed your deployment topology, before you've found out that the workload you planned for doesn't fit on the cluster you ordered. Memory estimation is what gives you that answer early: whether the model fits, in what configuration, and what concurrency your hardware can actually support under your expected workload. +The goal isn't a perfect number. It's a good-enough answer at day 0: before you've committed to hardware, before you've designed your deployment topology, before you've found out that the workload you planned for doesn't fit on the cluster you ordered. Memory estimation is what gives you that answer early: whether the model fits, in what configuration, and what concurrency your hardware can actually support under your expected workload. - [GitHub: llm-d-incubation/llm-d-planner](https://github.com/llm-d-incubation/llm-d-planner) - [Full accuracy report with per-model tables](https://github.com/llm-d-incubation/llm-d-planner/blob/main/accuracy/accuracy_report.md) - [Run the sweep on your own cluster](https://github.com/llm-d-incubation/llm-d-planner/blob/main/accuracy/README.md) -No one should have to spin up a cluster to find out their workload doesn't fit. +Before these experiments, we had formulas. 
Now we have evidence, and a clearer path to making the planner accurate enough to trust before you touch a cluster.